-#### Build environment preparation
+## Table of contents
-There are two ways to set up the build environment.
-##### Option 1:
+ - [Get Started](#get-started)
+ - [Docker](#docker)
+ - [API documentation](#api-documentation)
+ - [Using a private or gated model](#using-a-private-or-gated-model)
+ - [A note on Shared Memory (shm)](#a-note-on-shared-memory-shm)
+ - [Distributed Tracing](#distributed-tracing)
+ - [Architecture](#architecture)
+ - [Local install](#local-install)
+ - [Optimized architectures](#optimized-architectures)
+ - [Run locally](#run-locally)
+ - [Run](#run)
+ - [Quantization](#quantization)
+ - [Develop](#develop)
+ - [Testing](#testing)
-### **TODO**
+Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and [more](https://huggingface.co/docs/text-generation-inference/supported_models). TGI implements many features, such as:
-##### Option 2:
+- Simple launcher to serve most popular LLMs
+- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)
+- Tensor Parallelism for faster inference on multiple GPUs
+- Token streaming using Server-Sent Events (SSE)
+- Continuous batching of incoming requests for increased total throughput
+- [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) compatible with the OpenAI Chat Completion API
+- Optimized transformers code for inference using [Flash Attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
+- Quantization with:
+ - [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
+ - [GPT-Q](https://arxiv.org/abs/2210.17323)
+ - [EETQ](https://github.com/NetEase-FuXi/EETQ)
+ - [AWQ](https://github.com/casper-hansen/AutoAWQ)
+ - [Marlin](https://github.com/IST-DASLab/marlin)
+ - [fp8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/)
+- [Safetensors](https://github.com/huggingface/safetensors) weight loading
+- Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- Logits warper (temperature scaling, top-p, top-k, repetition penalty; see [transformers.LogitsProcessor](https://huggingface.co/docs/transformers/internal/generation_utils#transformers.LogitsProcessor) for more details)
+- Stop sequences
+- Log probabilities
+- [Speculation](https://huggingface.co/docs/text-generation-inference/conceptual/speculation) for ~2x lower latency
+- [Guidance/JSON](https://huggingface.co/docs/text-generation-inference/conceptual/guidance). Specify the output format to speed up inference and make sure the output is valid according to some specs (see the example after this list).
+- Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output
+- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance
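+
+As a quick illustration of the Guidance/JSON feature mentioned above, here is a minimal sketch of a constrained request against `/generate`; the prompt and JSON schema are made up for this example (see the Guidance docs linked above for the full API):
+
+```bash
+# illustrative only: constrain the output to a small JSON schema via parameters.grammar
+curl 127.0.0.1:8080/generate \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+    "inputs": "Name one deep learning framework.",
+    "parameters": {
+        "max_new_tokens": 64,
+        "grammar": {
+            "type": "json",
+            "value": {
+                "type": "object",
+                "properties": {"framework": {"type": "string"}},
+                "required": ["framework"]
+            }
+        }
+    }
+}'
+```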
-Based on the 光源 (SourceFind) pytorch2.1.0 base image. Image download: [https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch); download the image version matching pytorch2.1.0, python, dtk and your OS. The pytorch2.1.0 image already has triton and flash-attn installed.
+### Hardware support
+
+- [Nvidia](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference)
+- [AMD](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference) (-rocm)
+- [Inferentia](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference)
+- [Intel GPU](https://github.com/huggingface/text-generation-inference/pull/1475)
+- [Gaudi](https://github.com/huggingface/tgi-gaudi)
+- [Google TPU](https://huggingface.co/docs/optimum-tpu/howto/serving)
+
+
+## Get Started
+
+### Docker
+
+For a detailed starting guide, please see the [Quick Tour](https://huggingface.co/docs/text-generation-inference/quicktour). The easiest way of getting started is using the official Docker container:
+
+```shell
+model=HuggingFaceH4/zephyr-7b-beta
+# share a volume with the Docker container to avoid downloading weights every run
+volume=$PWD/data
+
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
+ ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model
+```
+
+And then you can make requests like
+
+```bash
+curl 127.0.0.1:8080/generate_stream \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+ -H 'Content-Type: application/json'
+```
+
+You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain OpenAI Chat Completion API-compatible responses.
+
+```bash
+curl localhost:8080/v1/chat/completions \
+ -X POST \
+ -d '{
+ "model": "tgi",
+ "messages": [
+ {
+ "role": "system",
+ "content": "You are a helpful assistant."
+ },
+ {
+ "role": "user",
+ "content": "What is deep learning?"
+ }
+ ],
+ "stream": true,
+ "max_tokens": 20
+}' \
+ -H 'Content-Type: application/json'
+```
+
+**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. To run the Docker container on a machine with no GPUs or CUDA support, remove the `--gpus all` flag and add `--disable-custom-kernels`. Please note that CPU is not the intended platform for this project, so performance might be subpar.
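+
+For example, a CPU-only run adapted from the command above might look roughly like this (illustrative only, and expect it to be slow):
+
+```shell
+# same $model/$volume variables as above; no --gpus flag, custom kernels disabled
+docker run --shm-size 1g -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.4.0 \
+    --model-id $model --disable-custom-kernels
+```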
+
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0-rocm --model-id $model` instead of the command above.
+
+To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
+```
+text-generation-launcher --help
+```
+
+### API documentation
+
+You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route.
+The Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).
+
+### Using a private or gated model
+
+You can use the `HF_TOKEN` environment variable to configure the token employed by
+`text-generation-inference`. This allows you to gain access to protected resources.
+
+For example, if you want to serve the gated Llama V2 model variants:
+
+1. Go to https://huggingface.co/settings/tokens
+2. Copy your cli READ token
+3. Export `HF_TOKEN=`
+
+or with Docker:
+
+```shell
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+token=
+
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model
+```
+
+### A note on Shared Memory (shm)
+
+[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
+`PyTorch` to do distributed training/inference. `text-generation-inference` makes
+use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.
+
+In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if
+peer-to-peer using NVLink or PCI is not possible.
+
+To allow the container to use 1G of Shared Memory and support SHM sharing, we add `--shm-size 1g` on the above command.
+
+If you are running `text-generation-inference` inside `Kubernetes`, you can also add Shared Memory to the container by
+creating a volume with:
+
+```yaml
+- name: shm
+ emptyDir:
+ medium: Memory
+ sizeLimit: 1Gi
+```
+
+and mounting it to `/dev/shm`.
+
+Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that
+this will impact performance.
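+
+For example, with the Docker command above, disabling SHM could look like the following sketch (illustrative only):
+
+```shell
+# NCCL_SHM_DISABLE=1 turns off SHM-based communication; expect slower multi-GPU inference
+docker run --gpus all -e NCCL_SHM_DISABLE=1 -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model
+```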
+
+### Distributed Tracing
+
+`text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
+by setting the address of an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
+overridden with the `--otlp-service-name` argument.
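+
+For example, a launch with tracing enabled might look like this sketch (the collector address and service name below are placeholders, not defaults):
+
+```shell
+# send traces to a local OTLP collector under a custom service name
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 \
+    --otlp-endpoint http://localhost:4317 \
+    --otlp-service-name text-generation-inference.demo
+```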
+
+### Architecture
+
+
+
+Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with TGI (Martin Iglesias Goyanes - Adyen, 2024)](https://www.adyen.com/knowledge-hub/llm-inference-at-scale-with-tgi)
+
+### Local install
+
+You can also opt to install `text-generation-inference` locally.
+
+First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
+Python 3.9, e.g. using `conda`:
-1. Install Rust
```shell
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+
+conda create -n text-generation-inference python=3.11
+conda activate text-generation-inference
```
-2. Install Protoc
+You may also need to install Protoc.
+
+On Linux:
+
```shell
PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
@@ -49,50 +216,77 @@ sudo unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP
```
-3. Install the TGI service
-```bash
-git clone http://developer.hpccube.com/codes/wangkx1/text_generation_server-dcu.git # switch to the branch you need
-cd text-generation-inference
-pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
-pip install -r pre_requirements.txt
+On MacOS, using Homebrew:
-# install exllama
-cd server
-make install-exllama # install exllama kernels
-make install-exllamav2 # install exllamav2 kernels
+```shell
+brew install protobuf
+```
-cd .. # return to the project root
-source $HOME/.cargo/env
-BUILD_EXTENSIONS=True make install # install the text-generation service
+Then run:
+```shell
+BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork with CUDA kernels
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
```
-4. Install the benchmark
-```bash
-cd text-generation-inference
-make install-benchmark
-```
-Note: if the installation is too slow, you can speed it up by switching the default package index with the following command.
-```bash
+**Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
+
+```shell
+sudo apt-get install libssl-dev gcc -y
```
-Also, if `cargo install` is too slow, you can speed it up by adding a mirror source in `~/.cargo/config`.
-## Check the installed version
-```bash
-text-generation-launcher -V # the version number matches the official release
+## Optimized architectures
+
+TGI works out of the box to serve optimized implementations of all modern model architectures. They can be found in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).
+
+Other architectures are supported on a best-effort basis using:
+
+`AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
+
+or
+
+`AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`
+
+
+
+## Run locally
+
+### Run
+
+```shell
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
```
-## Before use
+### Quantization
-```bash
-export PYTORCH_TUNABLEOP_ENABLED=0
+You can also run pre-quantized weights (AWQ, GPTQ, Marlin) or quantize weights on the fly with bitsandbytes, EETQ, or fp8 to reduce the VRAM requirement:
+
+```shell
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
```
-## Known Issues
+4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.
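+
+As a concrete illustration of the flag described above, enabling on-the-fly NF4 quantization (reusing the model id from the earlier example) could look like this:
+
+```shell
+# load the weights in 4-bit NF4 to reduce VRAM usage
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize bitsandbytes-nf4
+```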
+
+Read more about quantization in the [Quantization documentation](https://huggingface.co/docs/text-generation-inference/en/conceptual/quantization).
+
+## Develop
-- None
+```shell
+make server-dev
+make router-dev
+```
+
+## Testing
-## References
-- [README_ORIGIN](README_ORIGIN.md)
-- [https://github.com/huggingface/text-generation-inference](https://github.com/huggingface/text-generation-inference)
+```shell
+# python
+make python-server-tests
+make python-client-tests
+# or both server and client tests
+make python-tests
+# rust cargo tests
+make rust-tests
+# integration tests
+make integration-tests
+```
diff --git a/README_ORINGIN.md b/README_ORINGIN.md
deleted file mode 100644
index 74616748efa88a47785eaff7c1952370aaad20ea..0000000000000000000000000000000000000000
--- a/README_ORINGIN.md
+++ /dev/null
@@ -1,258 +0,0 @@
-
-
-
-
-
-
-# Text Generation Inference
-
-
-
-
-
-
-
-
-A Rust, Python and gRPC server for text generation inference. Used in production at [HuggingFace](https://huggingface.co)
-to power Hugging Chat, the Inference API and Inference Endpoint.
-
-
-
-## Table of contents
-
-- [Get Started](#get-started)
- - [API Documentation](#api-documentation)
- - [Using a private or gated model](#using-a-private-or-gated-model)
- - [A note on Shared Memory](#a-note-on-shared-memory-shm)
- - [Distributed Tracing](#distributed-tracing)
- - [Local Install](#local-install)
- - [CUDA Kernels](#cuda-kernels)
-- [Optimized architectures](#optimized-architectures)
-- [Run Mistral](#run-a-model)
- - [Run](#run)
- - [Quantization](#quantization)
-- [Develop](#develop)
-- [Testing](#testing)
-
-Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and [more](https://huggingface.co/docs/text-generation-inference/supported_models). TGI implements many features, such as:
-
-- Simple launcher to serve most popular LLMs
-- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)
-- Tensor Parallelism for faster inference on multiple GPUs
-- Token streaming using Server-Sent Events (SSE)
-- Continuous batching of incoming requests for increased total throughput
-- Optimized transformers code for inference using [Flash Attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
-- Quantization with :
- - [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
- - [GPT-Q](https://arxiv.org/abs/2210.17323)
- - [EETQ](https://github.com/NetEase-FuXi/EETQ)
- - [AWQ](https://github.com/casper-hansen/AutoAWQ)
-- [Safetensors](https://github.com/huggingface/safetensors) weight loading
-- Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
-- Logits warper (temperature scaling, top-p, top-k, repetition penalty, more details see [transformers.LogitsProcessor](https://huggingface.co/docs/transformers/internal/generation_utils#transformers.LogitsProcessor))
-- Stop sequences
-- Log probabilities
-- [Speculation](https://huggingface.co/docs/text-generation-inference/conceptual/speculation) ~2x latency
-- [Guidance/JSON](https://huggingface.co/docs/text-generation-inference/conceptual/guidance). Specify output format to speed up inference and make sure the output is valid according to some specs..
-- Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output
-- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance
-
-### Hardware support
-
-- [Nvidia](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference)
-- [AMD](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference) (-rocm)
-- [Inferentia](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference)
-- [Intel GPU](https://github.com/huggingface/text-generation-inference/pull/1475)
-- [Gaudi](https://github.com/huggingface/tgi-gaudi)
-- [Google TPU](https://huggingface.co/docs/optimum-tpu/howto/serving)
-
-
-## Get Started
-
-### Docker
-
-For a detailed starting guide, please see the [Quick Tour](https://huggingface.co/docs/text-generation-inference/quicktour). The easiest way of getting started is using the official Docker container:
-
-```shell
-model=HuggingFaceH4/zephyr-7b-beta
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
-```
-
-And then you can make requests like
-
-```bash
-curl 127.0.0.1:8080/generate_stream \
- -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
- -H 'Content-Type: application/json'
-```
-
-**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
-
-**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0-rocm --model-id $model` instead of the command above.
-
-To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
-```
-text-generation-launcher --help
-```
-
-### API documentation
-
-You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route.
-The Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).
-
-### Using a private or gated model
-
-You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
-`text-generation-inference`. This allows you to gain access to protected resources.
-
-For example, if you want to serve the gated Llama V2 model variants:
-
-1. Go to https://huggingface.co/settings/tokens
-2. Copy your cli READ token
-3. Export `HUGGING_FACE_HUB_TOKEN=`
-
-or with Docker:
-
-```shell
-model=meta-llama/Llama-2-7b-chat-hf
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-token=
-
-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
-```
-
-### A note on Shared Memory (shm)
-
-[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
-`PyTorch` to do distributed training/inference. `text-generation-inference` make
-use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.
-
-In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if
-peer-to-peer using NVLink or PCI is not possible.
-
-To allow the container to use 1G of Shared Memory and support SHM sharing, we add `--shm-size 1g` on the above command.
-
-If you are running `text-generation-inference` inside `Kubernetes`. You can also add Shared Memory to the container by
-creating a volume with:
-
-```yaml
-- name: shm
- emptyDir:
- medium: Memory
- sizeLimit: 1Gi
-```
-
-and mounting it to `/dev/shm`.
-
-Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that
-this will impact performance.
-
-### Distributed Tracing
-
-`text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
-by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
-
-### Architecture
-
-
-
-### Local install
-
-You can also opt to install `text-generation-inference` locally.
-
-First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
-Python 3.9, e.g. using `conda`:
-
-```shell
-curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
-
-conda create -n text-generation-inference python=3.11
-conda activate text-generation-inference
-```
-
-You may also need to install Protoc.
-
-On Linux:
-
-```shell
-PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
-curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
-sudo unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
-sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
-rm -f $PROTOC_ZIP
-```
-
-On MacOS, using Homebrew:
-
-```shell
-brew install protobuf
-```
-
-Then run:
-
-```shell
-BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork with CUDA kernels
-text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
-```
-
-**Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
-
-```shell
-sudo apt-get install libssl-dev gcc -y
-```
-
-## Optimized architectures
-
-TGI works out of the box to serve optimized models for all modern models. They can be found in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).
-
-Other architectures are supported on a best-effort basis using:
-
-`AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
-
-or
-
-`AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`
-
-
-
-## Run locally
-
-### Run
-
-```shell
-text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
-```
-
-### Quantization
-
-You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:
-
-```shell
-text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
-```
-
-4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.
-
-## Develop
-
-```shell
-make server-dev
-make router-dev
-```
-
-## Testing
-
-```shell
-# python
-make python-server-tests
-make python-client-tests
-# or both server and client tests
-make python-tests
-# rust cargo tests
-make rust-tests
-# integration tests
-make integration-tests
-```
diff --git a/router/client/Cargo.toml b/backends/client/Cargo.toml
similarity index 100%
rename from router/client/Cargo.toml
rename to backends/client/Cargo.toml
diff --git a/router/client/build.rs b/backends/client/build.rs
similarity index 100%
rename from router/client/build.rs
rename to backends/client/build.rs
diff --git a/router/client/src/lib.rs b/backends/client/src/lib.rs
similarity index 100%
rename from router/client/src/lib.rs
rename to backends/client/src/lib.rs
diff --git a/router/client/src/v2/client.rs b/backends/client/src/v2/client.rs
similarity index 100%
rename from router/client/src/v2/client.rs
rename to backends/client/src/v2/client.rs
diff --git a/router/client/src/v2/mod.rs b/backends/client/src/v2/mod.rs
similarity index 100%
rename from router/client/src/v2/mod.rs
rename to backends/client/src/v2/mod.rs
diff --git a/router/client/src/v2/sharded_client.rs b/backends/client/src/v2/sharded_client.rs
similarity index 100%
rename from router/client/src/v2/sharded_client.rs
rename to backends/client/src/v2/sharded_client.rs
diff --git a/router/client/src/v3/client.rs b/backends/client/src/v3/client.rs
similarity index 96%
rename from router/client/src/v3/client.rs
rename to backends/client/src/v3/client.rs
index a996b14fae873c552458a9cc85cb78a0bfb6d541..d43f789e7ca95421266a068e267d65e000f309ea 100644
--- a/router/client/src/v3/client.rs
+++ b/backends/client/src/v3/client.rs
@@ -153,9 +153,13 @@ impl Client {
}),
// We truncate the input on the server side to be sure that it has the correct size
truncate,
+ // Most requests will have that
+ add_special_tokens: true,
// Blocks and slots will be set on the server side if we use paged attention
blocks: vec![],
slots: vec![],
+ cache_len: 0,
+ chunk_len: None,
// Set sampling parameters to also take these ops into account in the max memory
parameters: Some(NextTokenChooserParameters {
temperature: 0.9,
@@ -214,8 +218,13 @@ impl Client {
pub async fn prefill(
&mut self,
batch: Batch,
+ cached_batch: Option,
) -> Result<(Vec, Option, PrefillTimings)> {
- let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
+ let request = tonic::Request::new(PrefillRequest {
+ batch: Some(batch),
+ cached_batch,
+ })
+ .inject_context();
let response = self.stub.prefill(request).await?.into_inner();
Ok((
response.generations,
diff --git a/router/client/src/v3/mod.rs b/backends/client/src/v3/mod.rs
similarity index 100%
rename from router/client/src/v3/mod.rs
rename to backends/client/src/v3/mod.rs
diff --git a/router/client/src/v3/sharded_client.rs b/backends/client/src/v3/sharded_client.rs
similarity index 96%
rename from router/client/src/v3/sharded_client.rs
rename to backends/client/src/v3/sharded_client.rs
index ae8a899b38a6b5571e4b61cba157deb73b5ab8c3..854a5895ebab7474b2391dead5078b19e3a05370 100644
--- a/router/client/src/v3/sharded_client.rs
+++ b/backends/client/src/v3/sharded_client.rs
@@ -134,11 +134,12 @@ impl ShardedClient {
pub async fn prefill(
&mut self,
batch: Batch,
+ cached_batch: Option,
) -> Result<(Vec, Option, PrefillTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
- .map(|client| Box::pin(client.prefill(batch.clone())))
+ .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result, Option, PrefillTimings)>> =
@@ -221,6 +222,7 @@ impl Health for ShardedClient {
chunks: vec![Chunk::Text("liveness".into()).into()],
}),
truncate: 10,
+ add_special_tokens: true,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
@@ -244,6 +246,8 @@ impl Health for ShardedClient {
// Block 0 is reserved for health checks
blocks: vec![0],
slots: (0..16).collect(),
+ cache_len: 0,
+ chunk_len: None,
adapter_id: None,
};
let batch = Batch {
@@ -253,7 +257,7 @@ impl Health for ShardedClient {
max_tokens: 2,
max_blocks: 1,
};
- self.clone().prefill(batch).await?;
+ self.clone().prefill(batch, None).await?;
Ok(())
}
}
diff --git a/router/grpc-metadata/Cargo.toml b/backends/grpc-metadata/Cargo.toml
similarity index 100%
rename from router/grpc-metadata/Cargo.toml
rename to backends/grpc-metadata/Cargo.toml
diff --git a/router/grpc-metadata/src/lib.rs b/backends/grpc-metadata/src/lib.rs
similarity index 100%
rename from router/grpc-metadata/src/lib.rs
rename to backends/grpc-metadata/src/lib.rs
diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..831372cdf99650041bcc51d2e438ccc2c7893a4f
--- /dev/null
+++ b/backends/trtllm/CMakeLists.txt
@@ -0,0 +1,75 @@
+cmake_minimum_required(VERSION 3.20)
+
+if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+ find_program(CCACHE_EXECUTABLE "ccache")
+ if (CCACHE_EXECUTABLE)
+ message(STATUS "Using ccache")
+ set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
+ endif ()
+endif ()
+
+if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
+ cmake_policy(SET CMP0135 NEW)
+endif ()
+
+project(tgi-trtllm-backend VERSION 1.0.0)
+set(CMAKE_CXX_STANDARD 20)
+
+include(FetchContent)
+include(ExternalProject)
+
+option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
+option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
+set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
+set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
+set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
+set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
+
+# We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
+find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
+
+#### External dependencies ####
+include(cmake/fmt.cmake)
+include(cmake/json.cmake)
+include(cmake/spdlog.cmake)
+include(cmake/trtllm.cmake)
+
+# Let's build TRTLLM as part of CMake
+add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
+
+# Tell CMake not to try to override the RPATH for executorWorker as it has no information on how to do so
+set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
+
+# TGI TRTLLM Backend definition
+add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
+include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
+target_include_directories(tgi_trtllm_backend_impl PRIVATE
+ $
+ $
+)
+target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
+target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
+
+# This installs all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make it easy to link / find them back
+install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
+install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
+
+#### Unit Tests ####
+if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
+ message(STATUS "Building tests")
+ FetchContent_Declare(
+ Catch2
+ GIT_REPOSITORY https://github.com/catchorg/Catch2
+ GIT_TAG v3.6.0
+ )
+ FetchContent_MakeAvailable(Catch2)
+
+ # add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
+ # target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
+
+ list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
+ include(CTest)
+ include(Catch)
+ # catch_discover_tests(tgi_trtllm_backend_tests)
+endif ()
diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..97ef1a768917160d744ad92a46e88a7a54d492a4
--- /dev/null
+++ b/backends/trtllm/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "text-generation-backends-trtllm"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+homepage.workspace = true
+
+[dependencies]
+async-trait = "0.1"
+async-stream = "0.3"
+clap = { version = "4.5", features = ["derive"] }
+cxx = "1.0"
+hashbrown = "0.14"
+hf-hub = { workspace = true }
+log = { version = "0.4", features = [] }
+text-generation-router = { path = "../../router" }
+tokenizers = { workspace = true }
+tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tokio-stream = "0.1.15"
+thiserror = "1.0.63"
+tracing = "0.1"
+tracing-opentelemetry = "0.25"
+tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
+
+[build-dependencies]
+cmake = "0.1"
+cxx-build = { version = "1.0", features = ["parallel"] }
+pkg-config = "0.3"
diff --git a/backends/trtllm/README.md b/backends/trtllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..94064504d74869c38b70165dc8a3bbf85eafa26d
--- /dev/null
+++ b/backends/trtllm/README.md
@@ -0,0 +1,46 @@
+# Text Generation Inference - TensorRT-LLM Backend Implementation
+
+## Description
+
+This folder provides the sources of the TensorRT-LLM backend implementation, powered by the new TensorRT-LLM Executor API.
+
+## Simplified Request Sequence
+
+```mermaid
+sequenceDiagram
+ actor User
+ participant TextGenerationInference.HttpServer
+ participant TextGenerationInference.TensorRtLlmBackend
+ participant TextGenerationInference.TensorRtLlmWorkerThread
+ participant TensorRtLlm.Executor
+ participant Nvidia.Gpu
+ User ->> TextGenerationInference.HttpServer: POST /generate
+ TextGenerationInference.HttpServer ->> TextGenerationInference.TensorRtLlmBackend: Validate and forward inputs & parameters
+ TextGenerationInference.TensorRtLlmBackend ->> TextGenerationInference.TensorRtLlmWorkerThread: Allocate a new context and spawn a new thread to handle the request
+ TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Submit the request to the In-Flight Batcher
+ activate Nvidia.Gpu
+ TensorRtLlm.Executor ->> Nvidia.Gpu: Add the request to the poll for execution
+ TensorRtLlm.Executor -->> TextGenerationInference.TensorRtLlmWorkerThread: Response with an unique request identifier
+ rect rgb(10, 92, 54)
+ loop every 100us
+ rect rgb(15, 81, 50)
+ alt Acquire lock to query executor
+ TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Poll request number of new token(s) generated
+ else There are new generated tokens
+ TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Retrieve newly generated tokens
+ TensorRtLlm.Executor -->> TextGenerationInference.TensorRtLlmWorkerThread: Return decoded token information and potential error (omitted)
+ rect rgb(11, 110, 79)
+ alt Generated token is final
+ TensorRtLlm.Executor ->> Nvidia.Gpu: Remove request from the scheduler and from the GPU
+ TextGenerationInference.TensorRtLlmWorkerThread -->> User: Stream the remaining decoded tokens and flush the connection
+ else Generated token is not final
+ TextGenerationInference.TensorRtLlmWorkerThread -->> User: Stream token back to the user as they get decoded
+ end
+ end
+ end
+ end
+ deactivate Nvidia.Gpu
+ end
+ end
+
+```
diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..985019260b837035907b7e8af5eb20688e85301e
--- /dev/null
+++ b/backends/trtllm/build.rs
@@ -0,0 +1,160 @@
+use cxx_build::CFG;
+use pkg_config;
+use std::env;
+use std::env::consts::ARCH;
+use std::path::{absolute, PathBuf};
+
+const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
+const CUDA_REQUIRED_VERSION: &str = "12.6";
+const MPI_REQUIRED_VERSION: &str = "4.1";
+const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
+const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
+const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR");
+
+// Dependencies
+const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"];
+const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
+const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
+ ("dylib", "tensorrt_llm"),
+ ("static", "tensorrt_llm_executor_static"),
+ ("dylib", "tensorrt_llm_nvrtc_wrapper"),
+ ("dylib", "nvinfer_plugin_tensorrt_llm"),
+ ("dylib", "decoder_attention"),
+];
+
+macro_rules! probe {
+ ($name: expr, $version: expr) => {
+ if let Err(_) = pkg_config::probe_library($name) {
+ pkg_config::probe_library(&format!("{}-{}", $name, $version))
+ .expect(&format!("Failed to locate {}", $name));
+ }
+ };
+}
+
+fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) {
+ // Build the backend implementation through CMake
+ let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
+ let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt");
+ let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real");
+
+ let mut install_path = PathBuf::from(install_path);
+ if !install_path.is_absolute() {
+ install_path = absolute(out_dir).expect("cannot happen").join(install_path);
+ }
+
+ let _ = cmake::Config::new(".")
+ .uses_cxx11()
+ .generator("Ninja")
+ .profile(match is_debug {
+ true => "Debug",
+ false => "Release",
+ })
+ .env("OPT_LEVEL", opt_level)
+ .define("CMAKE_INSTALL_PREFIX", &install_path)
+ .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
+ .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
+ .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
+ .build();
+
+ // Additional transitive CMake dependencies
+ let deps_folder = out_dir.join("build").join("_deps");
+ for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES {
+ let dep_name = match is_debug {
+ true => format!("{}d", dependency),
+ false => String::from(dependency),
+ };
+ let dep_path = deps_folder.join(format!("{}-build", dependency));
+ println!("cargo:rustc-link-search={}", dep_path.display());
+ println!("cargo:rustc-link-lib=static={}", dep_name);
+ }
+
+ // Emit linkage information from the artifacts we just built
+ let install_lib_path = install_path.join("lib");
+
+ println!(
+ r"cargo:warning=Adding link search path: {}",
+ install_lib_path.display()
+ );
+ println!(r"cargo:rustc-link-search={}", install_lib_path.display());
+
+ (PathBuf::from(install_path), deps_folder)
+}
+
+fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
+ let ndebug = match is_debug {
+ true => "1",
+ false => "0",
+ };
+
+ CFG.include_prefix = "backends/trtllm";
+ cxx_build::bridge("src/lib.rs")
+ .static_flag(true)
+ .include(deps_folder.join("fmt-src").join("include"))
+ .include(deps_folder.join("spdlog-src").join("include"))
+ .include(deps_folder.join("json-src").join("include"))
+ .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
+ .include("/usr/local/cuda/include")
+ .include("/usr/local/tensorrt/include")
+ .file("src/ffi.cpp")
+ .std("c++20")
+ .define("NDEBUG", ndebug)
+ .compile("tgi_trtllm_backend");
+
+ println!("cargo:rerun-if-changed=CMakeLists.txt");
+ println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
+ println!("cargo:rerun-if-changed=cmake/json.cmake");
+ println!("cargo:rerun-if-changed=cmake/fmt.cmake");
+ println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
+ println!("cargo:rerun-if-changed=include/backend.h");
+ println!("cargo:rerun-if-changed=lib/backend.cpp");
+ println!("cargo:rerun-if-changed=include/ffi.h");
+ println!("cargo:rerun-if-changed=src/ffi.cpp");
+}
+
+fn main() {
+ // Misc variables
+ let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
+ let build_profile = env::var("PROFILE").unwrap();
+ let (is_debug, opt_level) = match build_profile.as_ref() {
+ "debug" => (true, "0"),
+ _ => (false, "3"),
+ };
+
+ // Build the backend
+ let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir);
+
+ // Build the FFI layer calling the backend above
+ build_ffi_layer(&deps_folder, is_debug);
+
+ // Emit linkage search path
+ probe!("ompi", MPI_REQUIRED_VERSION);
+
+ // Probe CUDA & co. with pkg-config
+ CUDA_TRANSITIVE_DEPS.iter().for_each(|name| {
+ probe!(name, CUDA_REQUIRED_VERSION);
+ });
+
+ // NCCL is slightly trickier because it might not have a pkgconfig installed
+ let nccl_library_path_default = format!("/usr/local/{}-linux-gnu", ARCH);
+ let nccl_library_path = NCCL_ROOT_DIR.unwrap_or(&nccl_library_path_default);
+ println!(r"cargo:rustc-link-search=native={}", nccl_library_path);
+ println!("cargo:rustc-link-lib=dylib=nccl");
+
+ // TensorRT
+ let tensort_library_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt/lib");
+ println!(r"cargo:rustc-link-search=native={}", tensort_library_path);
+ println!("cargo:rustc-link-lib=dylib=nvinfer");
+
+ // TensorRT-LLM
+ TENSORRT_LLM_TRANSITIVE_DEPS
+ .iter()
+ .for_each(|(link_type, name)| {
+ println!("cargo:rustc-link-lib={}={}", link_type, name);
+ });
+
+ // Backend
+ BACKEND_DEPS.iter().for_each(|name| {
+ println!("cargo:rustc-link-lib=static={}", name);
+ });
+}
diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..afd6ea5f0906e31a491d2113d6527cd9d6a9850e
--- /dev/null
+++ b/backends/trtllm/cmake/fmt.cmake
@@ -0,0 +1,6 @@
+FetchContent_Declare(
+ fmt
+ DOWNLOAD_EXTRACT_TIMESTAMP
+ URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz
+)
+FetchContent_MakeAvailable(fmt)
diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..67eff2fe606708a4561b2f9ad3f19441c561a92d
--- /dev/null
+++ b/backends/trtllm/cmake/json.cmake
@@ -0,0 +1,6 @@
+fetchcontent_declare(
+ json
+ DOWNLOAD_EXTRACT_TIMESTAMP
+ URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
+)
+fetchcontent_makeavailable(json)
diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7f529a7d29e5ab39d9af9b233399f9cc3da1d9da
--- /dev/null
+++ b/backends/trtllm/cmake/spdlog.cmake
@@ -0,0 +1,17 @@
+set(SPDLOG_USE_FMT ON)
+set(SPDLOG_BUILD_SHARED OFF)
+set(SPDLOG_FMT_EXTERNAL ON)
+
+# Define the level at which SPDLOG_ compilation level is defined
+if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+ add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
+else ()
+ add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
+endif ()
+
+fetchcontent_declare(
+ spdlog
+ DOWNLOAD_EXTRACT_TIMESTAMP
+ URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
+)
+fetchcontent_makeavailable(spdlog)
diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..5f1b6c19c01f25e31a5703d75f6f8774b1a7250b
--- /dev/null
+++ b/backends/trtllm/cmake/trtllm.cmake
@@ -0,0 +1,43 @@
+set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
+set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
+
+set(USE_CXX11_ABI ON)
+set(BUILD_PYT OFF)
+set(BUILD_PYBIND OFF)
+set(BUILD_MICRO_BENCHMARKS OFF)
+set(BUILD_BENCHMARKS OFF)
+set(BUILD_TESTS OFF)
+set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
+
+message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
+if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+ set(FAST_BUILD ON)
+ set(NVTX_DISABLE OFF)
+else ()
+ set(FAST_BUILD OFF)
+ set(FAST_MATH ON)
+ set(NVTX_DISABLE ON)
+endif ()
+
+fetchcontent_declare(
+ trtllm
+ GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
+ GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc
+ GIT_SHALLOW FALSE
+ DOWNLOAD_EXTRACT_TIMESTAMP
+)
+fetchcontent_makeavailable(trtllm)
+
+message(STATUS "Found TensorRT-LLM: ${trtllm_SOURCE_DIR}")
+execute_process(COMMAND git lfs install WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
+execute_process(COMMAND git lfs pull WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
+
+# TRTLLM uses a JIT-based *precompiled* library to generate some specific kernels; we generate the path to it here
+set(TRTLLM_NVRTC_LIBRARY_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}tensorrt_llm_nvrtc_wrapper${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE INTERNAL "nvrtc wrapper library name")
+set(TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH "${trtllm_SOURCE_DIR}/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${CMAKE_LIBRARY_ARCHITECTURE}/${TRTLLM_NVRTC_LIBRARY_NAME}"
+ CACHE INTERNAL "nvrtc wrapper library path")
+
+# The same Executor Static library
+set(TRTLLM_EXECUTOR_STATIC_LIBRARY_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}tensorrt_llm_executor_static${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE INTERNAL "executor_static library name")
+set(TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH "${trtllm_SOURCE_DIR}/cpp/tensorrt_llm/executor/${CMAKE_LIBRARY_ARCHITECTURE}/${TRTLLM_EXECUTOR_STATIC_LIBRARY_NAME}" CACHE INTERNAL "executor_static library path")
diff --git a/server/marlin/marlin_kernels/py.typed b/backends/trtllm/cmake/utils/detect_cuda_arch.cu
similarity index 100%
rename from server/marlin/marlin_kernels/py.typed
rename to backends/trtllm/cmake/utils/detect_cuda_arch.cu
diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
new file mode 100644
index 0000000000000000000000000000000000000000..d23f6288964d74a6c71e11bc13b3b235a4d592fa
--- /dev/null
+++ b/backends/trtllm/include/backend.h
@@ -0,0 +1,144 @@
+//
+// Created by Morgan Funtowicz on 6/30/24.
+//
+
+#ifndef TGI_TRTLLM_BACKEND_H
+#define TGI_TRTLLM_BACKEND_H
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+using json = nlohmann::json;
+namespace tle = tensorrt_llm::executor;
+
+
+#define CAST_SIZETYPE(x) static_cast(x)
+
+namespace huggingface::tgi::backends {
+ using RequestId = tle::IdType;
+ using TokenId = tle::TokenIdType;
+
+ const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
+ constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
+ "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
+ constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
+ "Submitting inference [{}] to the executor ({:d} already in-flight)");
+ constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
+ "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}");
+
+ /**
+ * Initialize all the components required by TRTLLM.
+ * It is required to call this function before attempting to load any engine
+ */
+ void InitializeBackend();
+
+ /**
+ * Initialize logging mechanism
+ */
+ void InitializeLogging();
+
+
+ /**
+ *
+ * @param config TensorRT-LLM configuration object
+ * @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode
+ * @return
+ */
+ tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
+
+ /**
+ *
+ * @param worldSize
+ * @param workerPath
+ * @return
+ */
+ tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
+
+ /**
+ * Get the sampling configuration from the parameters provided by TGI
+ * @param topK
+ * @param topP
+ * @param temperature
+ * @param repetition_penalty
+ * @param frequency_penalty
+ * @param seed
+ * @return
+ */
+ tle::SamplingConfig GetSamplingConfig(
+ uint32_t topK,
+ float_t topP,
+ float_t temperature,
+ float_t repetition_penalty,
+ float_t frequency_penalty,
+ uint64_t seed
+ ) noexcept;
+
+ /**
+ * Attempt to retrieve the stop words from the generation_config.json, if any
+ * @param generationConfigPath
+ * @return
+ */
+ std::optional>>
+ GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept;
+
+ /**
+ *
+ */
+ class TensorRtLlmBackend {
+ private:
+ const json config;
+ tle::Executor executor;
+
+ /** Frequently accessed variables cached here **/
+ uint32_t maxNumTokens;
+ std::list> stopWords;
+
+ public:
+ explicit TensorRtLlmBackend(
+ const std::filesystem::path &engineFolder,
+ const std::filesystem::path &executorWorker
+ );
+
+ /**
+ * Query the executor for the number of tokens available for pulling
+ * @return
+ */
+ [[nodiscard]] size_t NumResponsesReady() const;
+
+ /**
+ * Submit a new generation task to the executor
+ * @param tokens
+ * @param topK
+ * @param topP
+ * @param temperature
+ * @param repetitionPenalty
+ * @param frequencyPenalty
+ * @param seed
+ * @return Request id related to this generation for reference
+ */
+ [[nodiscard]] RequestId Submit(
+ const std::vector &tokens,
+ uint32_t maxNewTokens,
+ int32_t topK,
+ float_t topP,
+ float_t temperature,
+ float_t repetitionPenalty,
+ float_t frequencyPenalty,
+ uint64_t seed
+ );
+
+ [[nodiscard]] std::vector PullNewTokens();
+ };
+}
+
+
+#endif //TGI_TRTLLM_BACKEND_H
diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h
new file mode 100644
index 0000000000000000000000000000000000000000..449bcd4d7398146867628825dfe182419b5666e6
--- /dev/null
+++ b/backends/trtllm/include/ffi.h
@@ -0,0 +1,75 @@
+//
+// Created by mfuntowicz on 7/11/24.
+//
+
+#ifndef TGI_TRTLLM_BACKEND_FFI_H
+#define TGI_TRTLLM_BACKEND_FFI_H
+
+#include
+#include
+#include
+#include "backend.h"
+
+namespace huggingface::tgi::backends {
+ class TensorRtLlmBackendImpl;
+}
+
+// Template to support returning error from TllmException back to Rust in a Result<>
+#include
+
+namespace rust::behavior {
+ template
+ static void trycatch(Try &&func, Fail &&fail) noexcept try {
+ func();
+ } catch (tensorrt_llm::common::TllmException &e) {
+ fail(e.what());
+ }
+}
+
+#include "backends/trtllm/src/lib.rs.h"
+
+namespace huggingface::tgi::backends {
+
+ class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
+ public:
+ /***
+ *
+ * @param engineFolder
+ * @param executorWorker
+ */
+ TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
+
+ /***
+ *
+ * @param tokens
+ * @param maxNewTokens
+ * @param topK
+ * @param topP
+ * @param temperature
+ * @param repetition_penalty
+ * @param frequency_penalty
+ * @param seed
+ * @return
+ */
+ [[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
+ uint64_t
+ Submit(rust::Slice tokens, uint32_t maxNewTokens,
+ int32_t topK, float_t topP, float_t temperature,
+ float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
+
+ /***
+ *
+ * @return
+ */
+ std::unique_ptr> PullTokens();
+ };
+
+ /***
+ *
+ * @param engineFolder
+ * @return
+ */
+ std::unique_ptr CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker);
+}
+
+#endif //TGI_TRTLLM_BACKEND_FFI_H
diff --git a/backends/trtllm/include/hardware.h b/backends/trtllm/include/hardware.h
new file mode 100644
index 0000000000000000000000000000000000000000..9633495f4fd54682da6508936b9cb853e221411d
--- /dev/null
+++ b/backends/trtllm/include/hardware.h
@@ -0,0 +1,59 @@
+//
+// Created by mfuntowicz on 7/23/24.
+//
+
+#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
+#define TGI_TRTLLM_BACKEND_HARDWARE_H
+
+#include
+#include
+#include
+#include
+#include
+
+namespace huggingface::hardware::cuda {
+
+#define AMPERE_SM_MAJOR 8
+#define HOPPER_SM_MAJOR 9
+
+ /**
+ * Store information about the version of the CUDA Compute Capabilities detected on the device
+ */
+ struct CudaComputeCapabilities {
+ int32_t major;
+ int32_t minor;
+
+ [[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
+
+ [[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; }
+ };
+
+ CudaComputeCapabilities GetCudaComputeCapabilities() {
+ // Get the compute capabilities of the current hardware
+ nvmlDevice_t device;
+ CudaComputeCapabilities capabilities{0, 0};
+ if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+ SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
+ if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
+ SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
+ }
+ }
+
+ return capabilities;
+ }
+
+ /**
+ * Return the number of GPUs detected, or std::nullopt if the device count cannot be queried
+ * @return
+ */
+ std::optional GetNumDevices() {
+ uint32_t numGpus = 0;
+ if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
+ return std::optional(numGpus);
+ } else {
+ return std::nullopt;
+ }
+ }
+}
+
+#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4dd41de0072866d818a3c901d813f0344a6af683
--- /dev/null
+++ b/backends/trtllm/lib/backend.cpp
@@ -0,0 +1,203 @@
+#include
+#include
+
+#include
+#include
+#include
+
+#include "backend.h"
+#include "hardware.h"
+
+
+void huggingface::tgi::backends::InitializeLogging() {
+#ifdef NDEBUG
+ if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
+ std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
+ std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+ return std::tolower(c);
+ });
+
+ if (log_level == "debug")
+ spdlog::set_level(spdlog::level::debug);
+ else
+ spdlog::set_level(spdlog::level::info);
+ }
+#else
+ spdlog::set_level(spdlog::level::debug);
+#endif
+}
+
+void huggingface::tgi::backends::InitializeBackend() {
+ SPDLOG_INFO("Initializing Backend...");
+ nvmlInit_v2();
+ initTrtLlmPlugins();
+
+ InitializeLogging();
+
+ SPDLOG_INFO("Backend Executor Version: {}", tle::version());
+ const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
+ if (numGpus.has_value()) {
+ SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
+ } else {
+ SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system");
+ }
+}
+
+[[nodiscard]]
+tle::ParallelConfig
+huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
+ auto mode = tle::CommunicationMode::kLEADER;
+ std::optional orchestratorConfig = std::nullopt;
+
+ if (worldSize > 1) {
+ SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+ mode = tle::CommunicationMode::kORCHESTRATOR;
+ orchestratorConfig = std::make_optional(true, workerPath, nullptr, true);
+ } else {
+ SPDLOG_INFO("Detected single engine deployment, using leader mode");
+ }
+
+ return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+}
+
+[[nodiscard]]
+tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
+ tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
+
+ // Retrieve the compute capabilities to enable some options at runtime
+ const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
+
+ // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
+ const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get();
+ execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));
+
+ // Define some configuration variables
+ execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
+ execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere());
+ execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
+ return execConfig;
+}
+
+tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
+ const uint32_t topK,
+ const float_t topP,
+ const float_t temperature,
+ const float_t repetition_penalty,
+ const float_t frequency_penalty,
+ const uint64_t seed) noexcept {
+
+ return tle::SamplingConfig(
+ 1, // TGI only use a single beam
+ topK,
+ topP,
+ std::nullopt,
+ std::nullopt,
+ std::nullopt,
+ seed,
+ temperature,
+ temperature,
+ std::nullopt,
+ repetition_penalty,
+ std::nullopt,
+ frequency_penalty
+ );
+}
+
+std::optional>>
+huggingface::tgi::backends::GetStopWordsFromConfig(
+ const std::filesystem::path &generationConfigPath) noexcept {
+ if (exists(generationConfigPath)) {
+ const auto generationConfig = json::parse(std::ifstream(generationConfigPath));
+ if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) {
+ SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size());
+ std::list> stopWords(eosTokenIds.size());
+
+ const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type {
+ return {tokenIdObj.template get()};
+ };
+
+ std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token);
+ return stopWords;
+ } else {
+ SPDLOG_INFO("Invalid EOS tokens entry found (not an array)");
+ }
+ } else {
+ SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist");
+ }
+
+ return std::nullopt;
+}
+
+huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
+ const std::filesystem::path &enginesFolder,
+ const std::filesystem::path &executorWorker
+) :
+ config(json::parse(std::ifstream(enginesFolder / "config.json"))),
+ executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY,
+ GetExecutorConfig(config, executorWorker.string())) {
+
+ SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get());
+
+ // Ensure we have enough GPUs on the system
+ const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get();
+ const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0);
+ if (numGpus < worldSize) {
+ SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize);
+ // todo : raise exception to catch on rust side
+ }
+
+ // Cache variables
+ maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();
+
+ // Attempt to discover stopWords from the generation_config.json
+ const auto generationConfigPath = enginesFolder / "generation_config.json";
+ stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list<std::vector<TokenId>>());
+}
+
+[[nodiscard("Returned number of requests needs to be consumed")]]
+size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
+#ifdef NDEBUG
+ return executor.getNumResponsesReady();
+#else
+ const auto numResponses = executor.getNumResponsesReady();
+ if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
+ return numResponses;
+#endif
+}
+
+[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
+tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
+ const std::vector<TokenId> &tokens,
+ const uint32_t maxNewTokens,
+ const int32_t topK,
+ const float_t topP,
+ const float_t temperature,
+ const float_t repetitionPenalty,
+ const float_t frequencyPenalty,
+ const uint64_t seed
+) {
+ const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast<uint32_t>(maxNumTokens - tokens.size()));
+#ifndef NDEBUG
+ {
+ const auto &iterations = executor.getLatestIterationStats();
+ const auto &lastIteration = iterations.front();
+
+ SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests);
+ SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
+ SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
+ }
+#endif
+
+ const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
+
+ // Build the request
+ auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
+ request.setStopWords(stopWords);
+
+ // Submit to the executor for batching
+ return executor.enqueueRequest(request);
+}
+
+std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() {
+ return executor.awaitResponses();
+}
diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4c2dc26b6bfbdf900edfbc178467b4ffddfd3bae
--- /dev/null
+++ b/backends/trtllm/scripts/install_tensorrt.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+set -ex
+
+TRT_VER_BASE="10.4.0"
+TRT_VER_FULL="${TRT_VER_BASE}.26"
+CUDA_VER="12.6"
+CUDNN_VER="9.5.0.50-1"
+NCCL_VER="2.22.3-1+cuda12.6"
+CUBLAS_VER="12.6.3.3-1"
+NVRTC_VER="12.6.77-1"
+
+for i in "$@"; do
+ case $i in
+ --TRT_VER=?*) TRT_VER="${i#*=}";;
+ --CUDA_VER=?*) CUDA_VER="${i#*=}";;
+ --CUDNN_VER=?*) CUDNN_VER="${i#*=}";;
+ --NCCL_VER=?*) NCCL_VER="${i#*=}";;
+ --CUBLAS_VER=?*) CUBLAS_VER="${i#*=}";;
+ *) ;;
+ esac
+ shift
+done
+
+NVCC_VERSION_OUTPUT=$(nvcc --version)
+if [[ $(echo $NVCC_VERSION_OUTPUT | grep -oP "\d+\.\d+" | head -n 1) != ${CUDA_VER} ]]; then
+ echo "The version of pre-installed CUDA is not equal to ${CUDA_VER}."
+ exit 1
+fi
+
+install_ubuntu_requirements() {
+ apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates
+ ARCH=$(uname -m)
+ if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
+ if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi
+ curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH}/cuda-keyring_1.1-1_all.deb
+ dpkg -i cuda-keyring_1.1-1_all.deb
+ rm /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list
+
+ apt-get update
+ if [[ $(apt list --installed | grep libcudnn9) ]]; then
+ apt-get remove --purge -y --allow-change-held-packages libcudnn9*
+ fi
+ if [[ $(apt list --installed | grep libnccl) ]]; then
+ apt-get remove --purge -y --allow-change-held-packages libnccl*
+ fi
+ if [[ $(apt list --installed | grep libcublas) ]]; then
+ apt-get remove --purge -y --allow-change-held-packages libcublas*
+ fi
+ if [[ $(apt list --installed | grep cuda-nvrtc-dev) ]]; then
+ apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
+ fi
+ CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
+ apt-get install -y --no-install-recommends libcudnn9-cuda-12=${CUDNN_VER} libcudnn9-dev-cuda-12=${CUDNN_VER}
+ apt-get install -y --no-install-recommends libnccl2=${NCCL_VER} libnccl-dev=${NCCL_VER}
+ apt-get install -y --no-install-recommends libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER}
+ # NVRTC static library doesn't exist in NGC PyTorch container.
+ NVRTC_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
+ apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
+ apt-get clean
+ rm -rf /var/lib/apt/lists/*
+}
+
+install_centos_requirements() {
+ CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
+ yum -y update
+ yum -y install epel-release
+ yum remove -y libnccl* && yum -y install libnccl-${NCCL_VER} libnccl-devel-${NCCL_VER}
+ yum remove -y libcublas* && yum -y install libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER} libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}
+ yum clean all
+}
+
+install_tensorrt() {
+ #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
+ #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
+ TRT_CUDA_VERSION="12.6"
+
+ if [ -z "$RELEASE_URL_TRT" ];then
+ ARCH=${TRT_TARGETARCH}
+ if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
+ if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
+ if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
+ if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
+ if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
+ RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${TRT_VER_BASE}/tars/TensorRT-${TRT_VER_FULL}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
+ fi
+ wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
+ tar -xf /tmp/TensorRT.tar -C /usr/local/
+ mv /usr/local/TensorRT-${TRT_VER_FULL} /usr/local/tensorrt
+ # pip3 install /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl
+ rm -rf /tmp/TensorRT.tar
+}
+
+# Install base packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+ debian)
+ install_ubuntu_requirements
+ install_tensorrt
+ ;;
+ ubuntu)
+ install_ubuntu_requirements
+ install_tensorrt
+ ;;
+ centos)
+ install_centos_requirements
+ install_tensorrt
+ ;;
+ *)
+ echo "Unable to determine OS..."
+ exit 1
+ ;;
+esac
diff --git a/backends/trtllm/src/errors.rs b/backends/trtllm/src/errors.rs
new file mode 100644
index 0000000000000000000000000000000000000000..812fd6e30d8b2ca2b0fad631c4fdf94d23e1b6b8
--- /dev/null
+++ b/backends/trtllm/src/errors.rs
@@ -0,0 +1,22 @@
+use std::path::PathBuf;
+use thiserror::Error;
+
+use text_generation_router::server;
+
+#[derive(Debug, Error)]
+pub enum TensorRtLlmBackendError {
+ #[error("Provided engine folder {0} doesn't exist")]
+ EngineFolderDoesntExists(PathBuf),
+ #[error("Provided executorWorker binary path {0} doesn't exist")]
+ ExecutorWorkerNotFound(PathBuf),
+ #[error("TensorRT-LLM Runtime error: {0}")]
+ Runtime(String),
+ #[error("Tokenizer error: {0}")]
+ Tokenizer(String),
+ #[error("Argument validation error: {0}")]
+ ArgumentValidation(String),
+ #[error("WebServer error: {0}")]
+ WebServer(#[from] server::WebServerError),
+ #[error("Tokio runtime failed to start: {0}")]
+ Tokio(#[from] std::io::Error),
+}
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a92c050f653f8c294f76795bb193d7428bd2633
--- /dev/null
+++ b/backends/trtllm/src/ffi.cpp
@@ -0,0 +1,89 @@
+//
+// Created by mfuntowicz on 6/30/24.
+//
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "backends/trtllm/include/ffi.h"
+
+
+huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
+ const std::string_view &engineFolder,
+ const std::string_view &executorWorker
+) : TensorRtLlmBackend(engineFolder, executorWorker) {}
+
+
+uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
+ rust::Slice<const uint32_t> tokens,
+ uint32_t maxNewTokens,
+ int32_t topK,
+ float_t topP,
+ float_t temperature,
+ float_t repetition_penalty,
+ float_t frequency_penalty,
+ uint64_t seed) {
+
+ // This will copy all the items from the initial slice
+ std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
+ return TensorRtLlmBackend::Submit(
+ std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
+}
+
+std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
+huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
+ const auto responses = TensorRtLlmBackend::PullNewTokens();
+
+ auto steps = std::make_unique<std::vector<GenerationStep>>();
+ steps->reserve(responses.size());
+
+#ifndef NDEBUG
+ SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
+#endif
+
+ // Transform tle::Response to GenerationStep
+ std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
+ const auto reqId = r.getRequestId();
+ if (!r.hasError()) {
+ const auto result = r.getResult();
+ return GenerationStep{
+ reqId,
+ static_cast<uint32_t>(result.outputTokenIds[0][0]),
+ result.logProbs.value()[0][0],
+ result.isFinal,
+ false,
+ std::string()
+ };
+ } else {
+ return GenerationStep{
+ reqId,
+ 0,
+ 0.0,
+ true,
+ true,
+ std::move(r.getErrorMsg())
+ };
+ }
+ });
+
+ return steps;
+}
+
+std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
+huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
+ SPDLOG_INFO("Creating TensorRT-LLM Backend");
+ // Unconditionally call this to initialize and discover TRTLLM plugins
+ InitializeBackend();
+
+ const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
+ const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
+ return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
+}
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
new file mode 100644
index 0000000000000000000000000000000000000000..edd8caff154e94269ff921ef93cb4e721131a9ea
--- /dev/null
+++ b/backends/trtllm/src/lib.rs
@@ -0,0 +1,68 @@
+pub use looper::TensorRtLlmBackendV2;
+
+pub mod errors;
+mod looper;
+mod utils;
+
+#[cxx::bridge(namespace = "huggingface::tgi::backends")]
+mod ffi {
+ /// Struct used as shared type between rust and C++ to represent the result
+ /// of a single decoding iteration
+ #[derive(Debug, Clone)]
+ pub struct GenerationStep {
+ request_id: u64,
+ token_id: u32,
+ log_prob: f32,
+ is_final: bool,
+ has_error: bool,
+ error_msg: String,
+ }
+
+ unsafe extern "C++" {
+ include!("backends/trtllm/src/ffi.cpp");
+
+ /// Represent an instance of the underlying TensorRT-LLM backend
+ type TensorRtLlmBackendImpl;
+
+ /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
+ ///
+ /// # Arguments
+ ///
+ /// * `engine_folder`: Path to the folder containing all the TRTLLM engines
+ /// * `executor_worker`: Path to the TRTLLM executor worker
+ ///
+ /// returns: a `Result` wrapping a `UniquePtr<TensorRtLlmBackendImpl>` on success
+ ///
+ /// # Examples
+ ///
+ /// ```
+ ///
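+ /// // Hedged, illustrative sketch only: the engine and worker paths below are
+ /// // hypothetical placeholders, not defaults shipped with TGI.
+ /// // let backend = create_tensorrt_llm_backend(
+ /// //     "/data/trtllm-engines/llama-3-8b-instruct",
+ /// //     "/usr/local/bin/executorWorker",
+ /// // ).expect("failed to create the TensorRT-LLM backend");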
+ /// ```
+ #[rust_name = "create_tensorrt_llm_backend"]
+ fn CreateTensorRtLlmBackend(
+ engine_folder: &str,
+ executor_worker: &str,
+ ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
+
+ #[rust_name = "num_responses_ready"]
+ fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
+
+ #[rust_name = "submit"]
+ fn Submit(
+ self: Pin<&mut TensorRtLlmBackendImpl>,
+ tokens: &[u32],
+ max_new_tokens: u32,
+ top_k: i32,
+ top_p: f32,
+ temperature: f32,
+ repetition_penalty: f32,
+ frequency_penalty: f32,
+ seed: u64,
+ ) -> Result<u64>;
+
+ #[rust_name = "pull_tokens"]
+ fn PullTokens(
+ self: Pin<&mut TensorRtLlmBackendImpl>,
+ ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
+ }
+}
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
new file mode 100644
index 0000000000000000000000000000000000000000..e26155c163c910f46349adbe82838345d37289ce
--- /dev/null
+++ b/backends/trtllm/src/looper.rs
@@ -0,0 +1,382 @@
+use std::hint;
+use std::ops::Deref;
+use std::path::Path;
+
+use async_trait::async_trait;
+use cxx::UniquePtr;
+use hashbrown::HashMap;
+use tokenizers::Tokenizer;
+use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
+use tokio::sync::TryAcquireError;
+use tokio::task::{spawn_blocking, JoinHandle};
+use tokio::time::Instant;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::{debug, error, warn};
+
+use text_generation_router::infer::InferError::{GenerationError, ValidationError};
+use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
+use text_generation_router::validation::ValidationError::{
+ EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality,
+};
+use text_generation_router::validation::{Chunk, ValidGenerateRequest};
+use text_generation_router::{FinishReason, Token};
+
+use crate::errors::TensorRtLlmBackendError;
+use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::utils::first_line;
+
+type InferResult<T> = Result<T, InferError>;
+
+/// Wrap the requests along with the channel used to stream back to the client the decoded tokens
+struct GenerationContext {
+ request: ValidGenerateRequest,
+ start: Option,
+ queued: Instant,
+ streamer: UnboundedSender<InferResult<InferStreamResponse>>,
+}
+
+#[derive(Debug, Copy, Clone)]
+struct DecodedToken {
+ id: u32,
+ log_prob: f32,
+ is_final: bool,
+}
+
+impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
+ type Error = InferError;
+
+ fn try_from(step: &'step GenerationStep) -> Result {
+ if !step.has_error {
+ Ok(Self {
+ id: step.token_id,
+ log_prob: step.log_prob,
+ is_final: step.is_final,
+ })
+ } else {
+ Err(GenerationError(step.error_msg.clone()))
+ }
+ }
+}
+
+/// Wraps the decoded token with the channel used to stream back to the client the decoded tokens
+struct DecodedTokenContext {
+ token: DecodedToken,
+ start: Option,
+ queued: Instant,
+ channel: UnboundedSender<InferResult<InferStreamResponse>>,
+}
+
+fn executor_status_looper(
+ mut backend: UniquePtr<TensorRtLlmBackendImpl>,
+ max_inflight_requests: usize,
+ mut waiting_requests: UnboundedReceiver<GenerationContext>,
+ post_processor_sender: UnboundedSender<(u64, InferResult<DecodedTokenContext>)>,
+) {
+ // Track the tuple (request_id, stream) for each request
+ let mut in_flights =
+ HashMap::<u64, GenerationContext>::with_capacity(max_inflight_requests * 2);
+
+ // TODO: Does it need a spin-loop?
+ 'scheduler: loop {
+ // Is there any request pending to be scheduled?
+ let awaiting_requests = waiting_requests.len();
+ for _ in 0..awaiting_requests {
+ // Retrieve all the requests
+ if let Some(mut ctx) = waiting_requests.blocking_recv() {
+ // Submit all the request to the executor and move the context to the in-flight tracker
+ let request = &ctx.request;
+ let generation_params = &request.parameters;
+ let stopping_params = &request.stopping_parameters;
+ let input_ids = request.input_ids.as_deref();
+
+ // Submit to the TensorRT-LLM executor for scheduling
+ match backend.pin_mut().submit(
+ &input_ids.unwrap(), // This is checked beforehand in validate()
+ stopping_params.max_new_tokens,
+ generation_params.top_k as i32,
+ generation_params.top_p,
+ generation_params.temperature,
+ generation_params.repetition_penalty,
+ generation_params.frequency_penalty,
+ generation_params.seed,
+ ) {
+ Ok(request_id) => {
+ // Insert the context linked to the generated request id in the tracker
+ debug!("[in-flight] Added {}", request_id);
+ ctx.start = Some(Instant::now());
+ in_flights.insert(request_id, ctx);
+ }
+ Err(e) => {
+ // Return to the caller
+ let what = e.to_string();
+ error!(error = what.as_str(), "Failed to schedule request");
+
+ let err = Err(InferError::Overloaded(TryAcquireError::NoPermits));
+ if let Err(_) = ctx.streamer.send(err) {
+ error!("Failed to send back error to the client");
+ }
+ }
+ };
+ }
+ }
+
+ if backend.num_responses_ready() > 0 {
+ match backend.pin_mut().pull_tokens() {
+ Ok(responses) => {
+ // Iterate through all the decoded token
+ for step in responses.deref() {
+ if let Some(ctx) = in_flights.get(&step.request_id) {
+ // Remove from tracked requests
+ let parcel =
+ DecodedToken::try_from(step).map(|dt| DecodedTokenContext {
+ token: dt,
+ start: ctx.start,
+ queued: ctx.queued,
+ channel: ctx.streamer.clone(),
+ });
+
+ // Submit the work to the post_processor
+ let posted = post_processor_sender.send((step.request_id, parcel));
+
+ if posted.is_err() || step.is_final {
+ debug!("Removing {}", step.request_id);
+ let _ = in_flights.remove(&step.request_id);
+ }
+ } else {
+ warn!("Untracked request {}", step.request_id,);
+ }
+ }
+ }
+ Err(ref err) => {
+ error!("Failed to get responses from the executor: {}.", err.what());
+ break 'scheduler;
+ }
+ }
+ }
+
+ // Hint the CPU we are spin-locking
+ hint::spin_loop();
+ }
+}
+
+fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
+ tokenizer: Tokenizer,
+ max_inflight_requests: usize,
+ mut decoded_tokens: UnboundedReceiver<(u64, InferResult<DecodedTokenContext>)>,
+) {
+ let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(max_inflight_requests * 2);
+
+ 'post_processor: loop {
+ if decoded_tokens.is_closed() {
+ warn!("Post processor IPC is closed, loop will exit now.");
+ break 'post_processor;
+ }
+
+ if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() {
+ match decoded {
+ Ok(ctx) => {
+ states
+ .entry(request_id)
+ .and_modify(|s| s.push(*&ctx.token.id))
+ .or_insert_with(|| {
+ let mut state = Vec::with_capacity(MAX_NUM_TOKENS);
+ state.push(*&ctx.token.id);
+ state
+ });
+
+ let out = match tokenizer.decode(&[ctx.token.id], false) {
+ Ok(text) => {
+ let is_special =
+ tokenizer.get_added_vocabulary().is_special_token(&text);
+ let token = Token {
+ id: ctx.token.id,
+ text,
+ logprob: ctx.token.log_prob,
+ special: is_special,
+ };
+
+ let out = if !ctx.token.is_final {
+ InferStreamResponse::Intermediate {
+ token,
+ top_tokens: vec![],
+ }
+ } else {
+ let tokens = states.remove(&request_id).unwrap();
+ let text = tokenizer.decode(&tokens, true);
+ let generated_text = GeneratedText {
+ text: text.unwrap(),
+ generated_tokens: tokens.len() as u32,
+ finish_reason: FinishReason::EndOfSequenceToken,
+ seed: None,
+ };
+
+ InferStreamResponse::End {
+ token,
+ top_tokens: vec![],
+ generated_text,
+ start: ctx.start.unwrap(),
+ queued: ctx.queued,
+ }
+ };
+
+ Ok(out)
+ }
+ Err(err) => Err(GenerationError(err.to_string())),
+ };
+
+ if let Err(_) = ctx.channel.send(out) {
+ warn!("Failed to send decoded token back to the user")
+ }
+ }
+ Err(_err) => {
+ todo!("what do we do?")
+ }
+ }
+ }
+ }
+}
+
+fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
+ engine_folder: P,
+ executor_worker_path: PP,
+) -> Result<(String, String), TensorRtLlmBackendError> {
+ // Retrieve paths as &str for the backend creation
+ let engine_folder = engine_folder.as_ref();
+ let executor_worker_path = executor_worker_path.as_ref();
+
+ // Ensure the engine folder exists
+ if !engine_folder.exists() {
+ let err = TensorRtLlmBackendError::EngineFolderDoesntExists(engine_folder.to_path_buf());
+
+ error!("Path validation failed: {}", err,);
+ return Err(err);
+ }
+
+ // Ensure executor worker binary exists
+ if !executor_worker_path.exists() {
+ let err = TensorRtLlmBackendError::ExecutorWorkerNotFound(executor_worker_path.to_path_buf());
+
+ error!("Path validation failed: {}", err,);
+ return Err(err);
+ }
+
+ let engine_folder = String::from(
+ engine_folder
+ .to_str()
+ .expect("Failed to convert engine_folder to valid UTF-8"),
+ );
+
+ let executor_worker_path = String::from(
+ executor_worker_path
+ .to_str()
+ .expect("Failed to convert executor_worker_path to valid UTF-8"),
+ );
+
+ Ok((engine_folder, executor_worker_path))
+}
+
+unsafe impl Send for TensorRtLlmBackendImpl {}
+
+pub struct TensorRtLlmBackendV2 {
+ executor_looper: JoinHandle<()>,
+ post_processor_looper: JoinHandle<()>,
+ executor: UnboundedSender<GenerationContext>,
+}
+
+impl TensorRtLlmBackendV2 {
+ pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
+ tokenizer: Tokenizer,
+ engine_folder: P,
+ executor_worker_path: PP,
+ max_inflight_requests: usize,
+ ) -> Result<Self, TensorRtLlmBackendError> {
+ let (engine_folder, executor_worker_path) =
+ ensure_paths_exist(engine_folder, executor_worker_path)?;
+
+ // Allocate the IPC layer to communicate with the backend
+ let (executor_sender, executor_receiver) = unbounded_channel();
+ let (post_processor_sender, post_processor_receiver) = unbounded_channel();
+
+ // Create the FFI backend
+ let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+ .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
+
+ // Executor looper is responsible for scheduling and pulling requests state at regular interval
+ let executor_looper = spawn_blocking(move || {
+ executor_status_looper(
+ backend,
+ max_inflight_requests,
+ executor_receiver,
+ post_processor_sender,
+ )
+ });
+
+ // Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user
+ let post_processor_looper = spawn_blocking(move || {
+ post_processor_looper::<256>(tokenizer, max_inflight_requests, post_processor_receiver)
+ });
+
+ Ok(TensorRtLlmBackendV2 {
+ executor_looper,
+ post_processor_looper,
+ executor: executor_sender,
+ })
+ }
+
+ fn validate(request: &ValidGenerateRequest) -> InferResult<()> {
+ if request.input_ids.is_none() {
+ return Err(ValidationError(UnsupportedModality("No token provided")));
+ }
+
+ if request.top_n_tokens > 1 {
+ return Err(ValidationError(TopNTokensDisabled));
+ }
+
+ // TODO: Is it really needed? How can it be validated before?
+ if request.parameters.grammar.is_some() {
+ return Err(ValidationError(Grammar));
+ }
+
+ match request.inputs.len() {
+ 0 => Err(ValidationError(EmptyInput)),
+ 2.. => Err(GenerationError(
+ "TensorRT-LLM backend don't support multi-chunk".into(),
+ )),
+ 1 => match request.inputs.first().expect("Single item-chunk") {
+ Chunk::Text(_) => Ok(()),
+ Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))),
+ },
+ }
+ }
+}
+
+#[async_trait]
+impl Backend for TensorRtLlmBackendV2 {
+ fn schedule(
+ &self,
+ inner: ValidGenerateRequest,
+ ) -> Result<UnboundedReceiverStream<InferResult<InferStreamResponse>>, InferError> {
+ Self::validate(&inner)?;
+
+ // Open-up the stream to send tokens
+ let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();
+
+ // Send the context to the executor for scheduling
+ let queued = Instant::now();
+ match self.executor.send(GenerationContext {
+ request: inner,
+ start: None,
+ queued,
+ streamer,
+ }) {
+ Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
+ Err(_) => Err(GenerationError(
+ "Failed to submit request to the backend".into(),
+ )),
+ }
+ }
+
+ async fn health(&self, _: bool) -> bool {
+ !self.executor_looper.is_finished() & !self.post_processor_looper.is_finished()
+ }
+}
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
new file mode 100644
index 0000000000000000000000000000000000000000..6a247fc1d5265b8b7b240e13169948e9be34eb0b
--- /dev/null
+++ b/backends/trtllm/src/main.rs
@@ -0,0 +1,302 @@
+use std::path::{Path, PathBuf};
+
+use clap::Parser;
+use hf_hub::api::tokio::{Api, ApiBuilder};
+use hf_hub::{Cache, Repo, RepoType};
+use tokenizers::Tokenizer;
+use tracing::info;
+
+use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
+use text_generation_backends_trtllm::TensorRtLlmBackendV2;
+use text_generation_router::server::get_base_tokenizer;
+use text_generation_router::usage_stats::UsageStatsLevel;
+use text_generation_router::{server, HubTokenizerConfig};
+
+/// App Configuration
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+struct Args {
+ #[clap(default_value = "128", long, env)]
+ max_concurrent_requests: usize,
+ #[clap(default_value = "2", long, env)]
+ max_best_of: usize,
+ #[clap(default_value = "4", long, env)]
+ max_stop_sequences: usize,
+ #[clap(default_value = "5", long, env)]
+ max_top_n_tokens: u32,
+ #[clap(default_value = "1024", long, env)]
+ max_input_tokens: usize,
+ #[clap(default_value = "2048", long, env)]
+ max_total_tokens: usize,
+ #[clap(default_value = "4096", long, env)]
+ max_batch_prefill_tokens: u32,
+ #[clap(long, env)]
+ max_batch_total_tokens: Option<u32>,
+ #[clap(default_value = "0.0.0.0", long, env)]
+ hostname: String,
+ #[clap(default_value = "3000", long, short, env)]
+ port: u16,
+ #[clap(long, env, required = true)]
+ tokenizer_name: String,
+ #[clap(long, env)]
+ tokenizer_config_path: Option<String>,
+ #[clap(long, env)]
+ revision: Option<String>,
+ #[clap(long, env)]
+ model_id: String,
+ #[clap(default_value = "2", long, env)]
+ validation_workers: usize,
+ #[clap(long, env)]
+ json_output: bool,
+ #[clap(long, env)]
+ otlp_endpoint: Option<String>,
+ #[clap(default_value = "text-generation-inference.router", long, env)]
+ otlp_service_name: String,
+ #[clap(long, env)]
+ cors_allow_origin: Option<Vec<String>>,
+ #[clap(default_value = "4", long, env)]
+ max_client_batch_size: usize,
+ #[clap(long, env)]
+ auth_token: Option<String>,
+ #[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
+ executor_worker: PathBuf,
+ #[clap(default_value = "on", long, env)]
+ usage_stats: UsageStatsLevel,
+}
+
+async fn get_tokenizer(
+ tokenizer_name: &str,
+ tokenizer_config_path: Option<&str>,
+ revision: Option<&str>,
+) -> Option<Tokenizer> {
+ // Parse Huggingface hub token
+ let authorization_token = std::env::var("HF_TOKEN")
+ .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+ .ok();
+
+ // Tokenizer instance
+ let local_path = Path::new(tokenizer_name);
+
+ // Shared API builder initialization
+ let api_builder = || {
+ let mut builder = ApiBuilder::new()
+ .with_progress(false)
+ .with_token(authorization_token);
+
+ if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+ builder = builder.with_cache_dir(cache_dir.into());
+ }
+
+ builder
+ };
+
+ // Decide if we need to use the API based on the revision and local path
+ let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();
+
+ // Initialize API if needed
+ #[derive(Clone)]
+ enum Type {
+ Api(Api),
+ Cache(Cache),
+ None,
+ }
+ let api = if use_api {
+ if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) {
+ let cache = std::env::var("HUGGINGFACE_HUB_CACHE")
+ .map_err(|_| ())
+ .map(|cache_dir| Cache::new(cache_dir.into()))
+ .unwrap_or_else(|_| Cache::default());
+ tracing::warn!("Offline mode active using cache defaults");
+ Type::Cache(cache)
+ } else {
+ tracing::info!("Using the Hugging Face API");
+ match api_builder().build() {
+ Ok(api) => Type::Api(api),
+ Err(_) => {
+ tracing::warn!("Unable to build the Hugging Face API");
+ Type::None
+ }
+ }
+ }
+ } else {
+ Type::None
+ };
+
+ // Load tokenizer and model info
+ let (
+ tokenizer_filename,
+ _config_filename,
+ tokenizer_config_filename,
+ _preprocessor_config_filename,
+ _processor_config_filename,
+ ) = match api {
+ Type::None => (
+ Some(local_path.join("tokenizer.json")),
+ Some(local_path.join("config.json")),
+ Some(local_path.join("tokenizer_config.json")),
+ Some(local_path.join("preprocessor_config.json")),
+ Some(local_path.join("processor_config.json")),
+ ),
+ Type::Api(api) => {
+ let api_repo = api.repo(Repo::with_revision(
+ tokenizer_name.to_string(),
+ RepoType::Model,
+ revision.unwrap_or_else(|| "main").to_string(),
+ ));
+
+ let tokenizer_filename = match api_repo.get("tokenizer.json").await {
+ Ok(tokenizer_filename) => Some(tokenizer_filename),
+ Err(_) => get_base_tokenizer(&api, &api_repo).await,
+ };
+ let config_filename = api_repo.get("config.json").await.ok();
+ let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
+ let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
+ let processor_config_filename = api_repo.get("processor_config.json").await.ok();
+
+ (
+ tokenizer_filename,
+ config_filename,
+ tokenizer_config_filename,
+ preprocessor_config_filename,
+ processor_config_filename,
+ )
+ }
+ Type::Cache(cache) => {
+ let repo = cache.repo(Repo::with_revision(
+ tokenizer_name.to_string(),
+ RepoType::Model,
+ revision.clone().unwrap_or_else(|| "main").to_string(),
+ ));
+ (
+ repo.get("tokenizer.json"),
+ repo.get("config.json"),
+ repo.get("tokenizer_config.json"),
+ repo.get("preprocessor_config.json"),
+ repo.get("processor_config.json"),
+ )
+ }
+ };
+
+ // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
+ let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
+ {
+ HubTokenizerConfig::from_file(filename)
+ } else {
+ tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
+ };
+
+ tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok())
+}
+
+#[tokio::main]
+async fn main() -> Result<(), TensorRtLlmBackendError> {
+ // Get args
+ let args = Args::parse();
+ // Pattern match configuration
+ let Args {
+ max_concurrent_requests,
+ max_best_of,
+ max_stop_sequences,
+ max_top_n_tokens,
+ max_input_tokens,
+ max_total_tokens,
+ max_batch_prefill_tokens,
+ max_batch_total_tokens,
+ hostname,
+ port,
+ tokenizer_name,
+ tokenizer_config_path,
+ revision,
+ model_id,
+ validation_workers,
+ json_output,
+ otlp_endpoint,
+ otlp_service_name,
+ cors_allow_origin,
+ max_client_batch_size,
+ auth_token,
+ executor_worker,
+ usage_stats,
+ } = args;
+
+ // Launch Tokio runtime
+ text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);
+
+ // Validate args
+ if max_input_tokens >= max_total_tokens {
+ return Err(TensorRtLlmBackendError::ArgumentValidation(
+ "`max_input_tokens` must be < `max_total_tokens`".to_string(),
+ ));
+ }
+ if max_input_tokens as u32 > max_batch_prefill_tokens {
+ return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
+ }
+
+ if validation_workers == 0 {
+ return Err(TensorRtLlmBackendError::ArgumentValidation(
+ "`validation_workers` must be > 0".to_string(),
+ ));
+ }
+
+ if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
+ if max_batch_prefill_tokens > *max_batch_total_tokens {
+ return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
+ }
+ if max_total_tokens as u32 > *max_batch_total_tokens {
+ return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
+ }
+ }
+
+ if !executor_worker.exists() {
+ return Err(TensorRtLlmBackendError::ArgumentValidation(format!(
+ "`executor_work` specified path doesn't exists: {}",
+ executor_worker.display()
+ )));
+ }
+
+ // Create the backend
+ let tokenizer = get_tokenizer(
+ &tokenizer_name,
+ tokenizer_config_path.as_deref(),
+ revision.as_deref(),
+ )
+ .await
+ .expect("Failed to retrieve tokenizer implementation");
+
+ info!("Successfully retrieved tokenizer {}", &tokenizer_name);
+ let backend = TensorRtLlmBackendV2::new(
+ tokenizer,
+ model_id,
+ executor_worker,
+ max_concurrent_requests,
+ )?;
+
+ info!("Successfully created backend");
+
+ // Run server
+ server::run(
+ backend,
+ max_concurrent_requests,
+ max_best_of,
+ max_stop_sequences,
+ max_top_n_tokens,
+ max_input_tokens,
+ max_total_tokens,
+ validation_workers,
+ auth_token,
+ tokenizer_name,
+ tokenizer_config_path,
+ revision,
+ hostname,
+ port,
+ cors_allow_origin,
+ false,
+ None,
+ None,
+ true,
+ max_client_batch_size,
+ usage_stats,
+ )
+ .await?;
+ Ok(())
+}
diff --git a/backends/trtllm/src/utils.rs b/backends/trtllm/src/utils.rs
new file mode 100644
index 0000000000000000000000000000000000000000..4dedb0078632b43a5f3446fbb141a91cfc003867
--- /dev/null
+++ b/backends/trtllm/src/utils.rs
@@ -0,0 +1,22 @@
+///
+/// Extract the first line of the provided string reference.
+/// If there are no lines in the buffer, it returns a string
+/// whose content is defined by `fail`.
+/// # Arguments
+///
+/// * `s`: The string buffer to extract the first line from
+/// * `fail`: The string returned if no lines are present in `s`
+///
+/// returns: String
+///
+/// # Examples
+///
+/// ```
+/// let s = "My name is Morgan.\n I'm working at Hugging Face.";
+/// first_line(s, "No line in string");
+/// ```
+#[inline]
+pub(crate) fn first_line(s: &str, fail: &str) -> String {
+ s.lines().next().unwrap_or(fail).to_string()
+}
diff --git a/backends/trtllm/tests/infer_test.cpp b/backends/trtllm/tests/infer_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8520065a759e22cf87908450d4a5f3a9526e7809
--- /dev/null
+++ b/backends/trtllm/tests/infer_test.cpp
@@ -0,0 +1,14 @@
+//
+// Created by mfuntowicz on 7/2/24.
+//
+#include <catch2/catch_all.hpp>
+#include <spdlog/spdlog.h>
+#include "../include/backend.h"
+
+TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
+ const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
+ const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
+
+ spdlog::info("Loading config from: {}", absolute(engines).string());
+ huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
+}
diff --git a/backends/v2/Cargo.toml b/backends/v2/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..4d32474e77f65d7fe0913e8177a1a13e739bdfc6
--- /dev/null
+++ b/backends/v2/Cargo.toml
@@ -0,0 +1,75 @@
+[package]
+name = "text-generation-router-v2"
+description = "Text Generation Webserver"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+homepage.workspace = true
+
+[lib]
+path = "src/lib.rs"
+
+[[bin]]
+name = "text-generation-router-v2"
+path = "src/main.rs"
+
+[dependencies]
+async-trait = "0.1.74"
+async-stream = "0.3.5"
+axum = { version = "0.7", features = ["json"] }
+axum-tracing-opentelemetry = "0.16"
+text-generation-router = { path = "../../router" }
+clap = { version = "4.4.5", features = ["derive", "env"] }
+grpc-metadata = { path = "../grpc-metadata" }
+futures = "0.3.28"
+hf-hub = { workspace = true }
+jsonschema = { version = "0.17.1", features = ["draft202012"] }
+metrics = { workspace = true }
+metrics-exporter-prometheus = { workspace = true }
+nohash-hasher = "0.2.0"
+opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
+opentelemetry-otlp = "0.13.0"
+rand = "0.8.5"
+reqwest = { version = "0.11.20", features = [] }
+serde = "1.0.188"
+serde_json = "1.0.107"
+slotmap = "1.0.7"
+thiserror = "1.0.48"
+tokenizers = { workspace = true }
+tokio = { version = "1.32.0", features = [
+ "rt",
+ "rt-multi-thread",
+ "parking_lot",
+ "signal",
+ "sync",
+] }
+tokio-stream = "0.1.14"
+tower-http = { version = "0.5.1", features = ["cors"] }
+tracing = "0.1.37"
+tracing-opentelemetry = "0.21.0"
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
+utoipa = { version = "4.2.0", features = ["axum_extras"] }
+utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] }
+init-tracing-opentelemetry = { version = "0.14.1", features = [
+ "opentelemetry-otlp",
+] }
+minijinja = { workspace = true }
+minijinja-contrib = { workspace = true }
+futures-util = "0.3.30"
+regex = "1.10.3"
+once_cell = "1.19.0"
+image = "0.25.1"
+base64 = { workspace = true }
+prost = "^0.12"
+tonic = "^0.10"
+tower = "^0.4"
+
+[build-dependencies]
+tonic-build = "0.10.1"
+prost-build = "0.12.1"
+
+[features]
+default = ["ngrok"]
+ngrok = ["text-generation-router/ngrok"]
+google = ["text-generation-router/google"]
+kserve = ["text-generation-router/kserve"]
diff --git a/backends/v2/build.rs b/backends/v2/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..f1d85dc70ed09d9d47f5890da215b841834e6824
--- /dev/null
+++ b/backends/v2/build.rs
@@ -0,0 +1,19 @@
+use std::fs;
+
+fn main() -> Result<(), Box> {
+ println!("cargo:rerun-if-changed=../../proto/");
+
+ fs::create_dir_all("src/client/pb").unwrap_or(());
+ let mut config = prost_build::Config::new();
+ config.protoc_arg("--experimental_allow_proto3_optional");
+
+ tonic_build::configure()
+ .build_client(true)
+ .build_server(false)
+ .out_dir("src/client/pb")
+ .include_file("mod.rs")
+ .compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
+ .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
+
+ Ok(())
+}
diff --git a/backends/v2/src/backend.rs b/backends/v2/src/backend.rs
new file mode 100644
index 0000000000000000000000000000000000000000..bc264138d28636abda4e5fe02ec956abf14fd8ba
--- /dev/null
+++ b/backends/v2/src/backend.rs
@@ -0,0 +1,502 @@
+use crate::client::{Batch, CachedBatch, ClientError, Generation, Health, ShardedClient};
+/// Batching and inference logic
+use crate::queue::{Entry, Queue};
+use async_trait::async_trait;
+use nohash_hasher::IntMap;
+use std::sync::Arc;
+use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
+use text_generation_router::validation::ValidGenerateRequest;
+use text_generation_router::{FinishReason, PrefillToken, Token};
+use tokio::sync::mpsc::error::SendError;
+use tokio::sync::{mpsc, Notify};
+use tokio::time::Instant;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::{info_span, instrument, Instrument, Span};
+
+pub struct BackendV2 {
+ /// Request queue
+ queue: Queue,
+ /// Notify batcher on queue appends
+ batching_task_notifier: Arc,
+ /// Client clone, used for health checks to skip the queue
+ client: ShardedClient,
+}
+
+impl BackendV2 {
+ #[allow(clippy::too_many_arguments)]
+ pub(crate) fn new(
+ client: ShardedClient,
+ waiting_served_ratio: f32,
+ max_batch_prefill_tokens: u32,
+ max_batch_total_tokens: u32,
+ max_waiting_tokens: usize,
+ max_batch_size: Option<usize>,
+ requires_padding: bool,
+ window_size: Option<u32>,
+ speculate: u32,
+ ) -> Self {
+ // Infer shared state
+ let attention = std::env::var("ATTENTION").unwrap_or("paged".to_string());
+ let block_size = match attention.as_str() {
+ "flashinfer" => 1,
+ "flashdecoding" => 256,
+ "paged" => 16,
+ _ => unreachable!(),
+ };
+
+ let queue = Queue::new(requires_padding, block_size, window_size, speculate);
+ let batching_task_notifier = Arc::new(Notify::new());
+
+ // Spawn batching background task that contains all the inference logic
+ tokio::spawn(batching_task(
+ client.clone(),
+ waiting_served_ratio,
+ max_batch_prefill_tokens,
+ max_batch_total_tokens,
+ max_waiting_tokens,
+ max_batch_size,
+ queue.clone(),
+ batching_task_notifier.clone(),
+ ));
+
+ Self {
+ queue,
+ batching_task_notifier,
+ client,
+ }
+ }
+}
+
+#[async_trait]
+impl Backend for BackendV2 {
+ #[instrument(skip_all)]
+ fn schedule(
+ &self,
+ request: ValidGenerateRequest,
+ ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
+ // MPSC channel to communicate with the background batching task
+ let (response_tx, response_rx) = mpsc::unbounded_channel();
+
+ // Append the request to the queue
+ self.queue.append(Entry {
+ request,
+ response_tx,
+ span: Span::current(),
+ temp_span: None,
+ queue_time: Instant::now(),
+ batch_time: None,
+ });
+
+ // Notify the background task that we have a new entry in the queue that needs
+ // to be batched
+ self.batching_task_notifier.notify_one();
+
+ // Return stream
+ Ok(UnboundedReceiverStream::new(response_rx))
+ }
+
+ async fn health(&self, current_health: bool) -> bool {
+ if current_health {
+ // Generation is healthy, we only check that the shards can allocate on device
+ self.client.device_health().await
+ } else {
+ self.client.model_health().await
+ }
+ .is_ok()
+ }
+}
+
+/// Batching logic
+/// Will be launched in a background Tokio task
+///
+/// Batches requests and sends them to the inference server
+#[allow(clippy::too_many_arguments)]
+pub(crate) async fn batching_task(
+ mut client: ShardedClient,
+ waiting_served_ratio: f32,
+ max_batch_prefill_tokens: u32,
+ max_batch_total_tokens: u32,
+ max_waiting_tokens: usize,
+ max_batch_size: Option<usize>,
+ queue: Queue,
+ notifier: Arc,
+) {
+ // Infinite loop
+ loop {
+ // Wait for a notification from the Infer struct
+ notifier.notified().await;
+
+ // Get the next batch from the queue
+ // This batch might be smaller than the maximum batch size if there are not enough requests
+ // waiting in the queue
+ while let Some((mut entries, batch, span)) = queue
+ .next_batch(
+ None,
+ max_batch_size,
+ max_batch_prefill_tokens,
+ max_batch_total_tokens,
+ )
+ .await
+ {
+ let mut cached_batch = prefill(&mut client, batch, &mut entries)
+ .instrument(span)
+ .await;
+ let mut waiting_tokens = 1;
+
+ // We loop until we do not receive any cached batch from the inference server (== until
+ // all requests have met their stopping criteria)
+ while let Some(batch) = cached_batch {
+ // Get current batch info
+ let batch_size = batch.size;
+ let batch_max_tokens = batch.max_tokens;
+ let mut batches = vec![batch];
+ metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
+ metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);
+
+ let min_size = if waiting_tokens >= max_waiting_tokens {
+ // If we didn't onboard any new requests since >= max_waiting_tokens, we try
+ // to add a new batch even though its size might be small
+ None
+ } else {
+ // Minimum batch size
+ Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+ };
+
+ let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
+ let max_size =
+ max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize));
+ // Try to get a new batch
+ if let Some((mut new_entries, new_batch, span)) = queue
+ .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)
+ .await
+ {
+ // Tracking metrics
+ if min_size.is_some() {
+ metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
+ .increment(1);
+ } else {
+ metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
+ .increment(1);
+ }
+
+ entries.iter_mut().for_each(|(_, entry)| {
+ // Create a new span to add the info that this entry is waiting
+ // because a new batch is being computed
+ let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
+ // Add relationships
+ span.follows_from(&entry_waiting_span);
+ entry_waiting_span.follows_from(&span);
+ // Update entry
+ entry.temp_span = Some(entry_waiting_span);
+ });
+
+ // Generate one token for this new batch to have the attention past in cache
+ let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries)
+ .instrument(span)
+ .await;
+ // Reset waiting counter
+ waiting_tokens = 1;
+ // Extend current batch with the new batch
+ if let Some(new_cached_batch) = new_cached_batch {
+ entries.extend(new_entries);
+ batches.push(new_cached_batch);
+ }
+ }
+
+ // Create span for this batch to add context to inference calls
+ let next_batch_size = entries.len();
+ let next_batch_span =
+ info_span!(parent: None, "batch", batch_size = next_batch_size);
+ entries.iter_mut().for_each(|(_, entry)| {
+ // Create a new span to link the batch back to this entry
+ let entry_batch_span = info_span!(parent: &entry.span, "infer");
+ // Add relationships
+ next_batch_span.follows_from(&entry_batch_span);
+ entry_batch_span.follows_from(&next_batch_span);
+ // Update entry
+ entry.temp_span = Some(entry_batch_span);
+ });
+
+ cached_batch = decode(&mut client, batches, &mut entries)
+ .instrument(next_batch_span)
+ .await;
+ waiting_tokens += 1;
+ }
+ metrics::gauge!("tgi_batch_current_size").set(0.0);
+ metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
+ }
+ }
+}
+
+#[instrument(skip_all)]
+async fn prefill(
+ client: &mut ShardedClient,
+ batch: Batch,
+ entries: &mut IntMap<u64, Entry>,
+) -> Option<CachedBatch> {
+ let start_time = Instant::now();
+ let batch_id = batch.id;
+ metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);
+
+ match client.prefill(batch).await {
+ Ok((generations, next_batch, timings)) => {
+ let start_filtering_time = Instant::now();
+ // Send generated tokens and filter stopped entries
+ filter_send_generations(generations, entries);
+
+ // Filter next batch and remove requests that were stopped
+ let next_batch = filter_batch(client, next_batch, entries).await;
+
+ metrics::histogram!("tgi_batch_forward_duration","method" => "prefill")
+ .record(timings.forward.as_secs_f64());
+ metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
+ .record(timings.decode.as_secs_f64());
+ metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
+ .record(start_filtering_time.elapsed().as_secs_f64());
+ metrics::histogram!("tgi_batch_inference_duration","method" => "prefill")
+ .record(start_time.elapsed().as_secs_f64());
+ metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
+ next_batch
+ }
+ // If we have an error, we discard the whole batch
+ Err(err) => {
+ let _ = client.clear_cache(Some(batch_id)).await;
+ send_errors(err, entries);
+ metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
+ None
+ }
+ }
+}
+
+#[instrument(skip_all)]
+async fn decode(
+ client: &mut ShardedClient,
+ batches: Vec<CachedBatch>,
+ entries: &mut IntMap<u64, Entry>,
+) -> Option<CachedBatch> {
+ let start_time = Instant::now();
+ let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
+ metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);
+
+ match client.decode(batches).await {
+ Ok((generations, next_batch, timings)) => {
+ let start_filtering_time = Instant::now();
+ // Send generated tokens and filter stopped entries
+ filter_send_generations(generations, entries);
+
+ // Filter next batch and remove requests that were stopped
+ let next_batch = filter_batch(client, next_batch, entries).await;
+
+ if let Some(concat_duration) = timings.concat {
+ metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
+ .record(concat_duration.as_secs_f64());
+ }
+ metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
+ .record(timings.forward.as_secs_f64());
+ metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
+ .record(timings.decode.as_secs_f64());
+ metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
+ .record(start_filtering_time.elapsed().as_secs_f64());
+ metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
+ .record(start_time.elapsed().as_secs_f64());
+ metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
+ next_batch
+ }
+ // If we have an error, we discard the whole batch
+ Err(err) => {
+ for id in batch_ids {
+ let _ = client.clear_cache(Some(id)).await;
+ }
+ send_errors(err, entries);
+ metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
+ None
+ }
+ }
+}
+
+/// Filter a `batch` and remove all requests not present in `entries`
+#[instrument(skip_all)]
+async fn filter_batch(
+ client: &mut ShardedClient,
+ next_batch: Option<CachedBatch>,
+ entries: &IntMap<u64, Entry>,
+) -> Option<CachedBatch> {
+ let mut batch = next_batch?;
+
+ // No need to filter
+ if batch.size as usize == entries.len() {
+ return Some(batch);
+ }
+
+ let id = batch.id;
+
+ // Retain only requests that are still in entries
+ batch.request_ids.retain(|id| entries.contains_key(id));
+
+ if batch.request_ids.is_empty() {
+ // All requests have been filtered out
+ // Next batch is now empty
+ // Clear it from the Python shards cache
+ // We unwrap here as we need to panic since we cannot recover if this method fails
+ client.clear_cache(Some(id)).await.unwrap();
+ None
+ } else {
+ // Filter Python shard cache
+ // We unwrap here as we need to panic since we cannot recover if this method fails
+ client.filter_batch(id, batch.request_ids).await.unwrap()
+ }
+}
+
+/// Send one or multiple `InferStreamResponse` to Infer for all `entries`
+/// and filter entries
+#[instrument(skip_all)]
+fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
+ generations.into_iter().for_each(|generation| {
+ let id = generation.request_id;
+ // Get entry
+ // We can `expect` here as the request id should always be in the entries
+ let entry = entries
+ .get(&id)
+ .expect("ID not found in entries. This is a bug.");
+
+ // Create and enter a span to link this function back to the entry
+ let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
+ // Send generation responses back to the infer task
+ // If we receive an error from the channel, it means that the client dropped the
+ // request and we need to stop generating, hence the unwrap_or(true)
+ let stopped = send_responses(generation, entry).inspect_err(|_err| {
+ tracing::error!("Entry response channel error.");
+ metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
+ }).unwrap_or(true);
+ if stopped {
+ entries.remove(&id).expect("ID not found in entries. This is a bug.");
+ }
+ });
+}
+
+/// Send responses through the `entry` response channel
+fn send_responses(
+ generation: Generation,
+ entry: &Entry,
+) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
+ // Return directly if the channel is disconnected
+ if entry.response_tx.is_closed() {
+ metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
+ return Ok(true);
+ }
+
+ let mut stopped = false;
+
+ if let Some(prefill_tokens) = generation.prefill_tokens {
+ // Create Token objects
+ // We do that here instead of in the Python code as Rust for loops are faster
+ let prefill_tokens = prefill_tokens
+ .ids
+ .into_iter()
+ .zip(prefill_tokens.logprobs)
+ .zip(prefill_tokens.texts)
+ .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
+ .collect();
+
+ // Send message
+ entry
+ .response_tx
+ .send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
+ }
+
+ // Create last Token
+ let tokens_ = generation.tokens.expect("Non empty tokens in generation");
+ let n = tokens_.ids.len();
+ metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
+ let mut iterator = tokens_
+ .ids
+ .into_iter()
+ .zip(tokens_.logprobs)
+ .zip(tokens_.texts)
+ .zip(tokens_.is_special)
+ .enumerate()
+ .peekable();
+ while let Some((i, (((id, logprob), text), special))) = iterator.next() {
+ let token = Token {
+ id,
+ text,
+ logprob,
+ special,
+ };
+ let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
+ top_tokens_
+ .ids
+ .iter()
+ .zip(top_tokens_.logprobs.iter())
+ .zip(top_tokens_.texts.iter())
+ .zip(top_tokens_.is_special.iter())
+ .map(|(((&id, &logprob), text), &special)| Token {
+ id,
+ text: text.to_string(),
+ logprob,
+ special,
+ })
+ .collect()
+ } else {
+ vec![]
+ };
+ match (&generation.generated_text, iterator.peek()) {
+ (Some(generated_text), None) => {
+ // Generation has ended
+ stopped = true;
+ // Send message
+ entry.response_tx.send(Ok(InferStreamResponse::End {
+ token,
+ top_tokens,
+ generated_text: GeneratedText::from(generated_text.clone()),
+ queued: entry.queue_time,
+ start: entry.batch_time.unwrap(),
+ }))?;
+ }
+ _ => {
+ // Send message
+ entry
+ .response_tx
+ .send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
+ }
+ }
+ }
+
+ Ok(stopped)
+}
+
+/// Send errors to Infer for all `entries`
+#[instrument(skip_all)]
+fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
+ entries.drain().for_each(|(_, entry)| {
+ // Create and enter a span to link this function back to the entry
+ let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
+ let err = InferError::GenerationError(error.to_string());
+ metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
+ tracing::error!("{err}");
+
+ // unwrap_or is valid here as we don't care if the receiver is gone.
+ entry
+ .response_tx
+ .send(Err(err))
+ .unwrap_or(());
+ });
+}
+
+impl From for GeneratedText {
+ fn from(value: crate::client::GeneratedText) -> Self {
+ let v2_finish_reason = crate::client::FinishReason::try_from(value.finish_reason).unwrap();
+ let finish_reason = match v2_finish_reason {
+ crate::client::FinishReason::Length => FinishReason::Length,
+ crate::client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
+ crate::client::FinishReason::StopSequence => FinishReason::StopSequence,
+ };
+
+ Self {
+ text: value.text,
+ generated_tokens: value.generated_tokens,
+ finish_reason,
+ seed: value.seed,
+ }
+ }
+}
diff --git a/backends/v2/src/client/grpc_client.rs b/backends/v2/src/client/grpc_client.rs
new file mode 100644
index 0000000000000000000000000000000000000000..b494352141cc1575854f5926ed70c17d6275dd45
--- /dev/null
+++ b/backends/v2/src/client/grpc_client.rs
@@ -0,0 +1,257 @@
+/// Single shard Client
+use crate::client::pb;
+use crate::client::{ClientError, Result, WARMUP_IMAGE_BASE64};
+use grpc_metadata::InjectTelemetryContext;
+use pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
+use pb::generate::v2::*;
+use std::cmp::min;
+use std::time::Duration;
+use tonic::transport::{Channel, Uri};
+use tracing::instrument;
+
+/// Text Generation Inference gRPC client
+#[derive(Debug, Clone)]
+pub struct Client {
+ stub: TextGenerationServiceClient<Channel>,
+}
+
+impl Client {
+ /// Returns a client connected to the given url
+ #[allow(dead_code)]
+ pub async fn connect(uri: Uri) -> Result<Self> {
+ let channel = Channel::builder(uri).connect().await?;
+
+ Ok(Self {
+ stub: TextGenerationServiceClient::new(channel),
+ })
+ }
+
+ /// Returns a client connected to the given unix socket
+ pub async fn connect_uds(path: String) -> Result<Self> {
+ let channel = Channel::from_shared("http://[::]:50051".to_string())
+ .unwrap()
+ .connect_with_connector(tower::service_fn(move |_: Uri| {
+ tokio::net::UnixStream::connect(path.clone())
+ }))
+ .await?;
+
+ Ok(Self {
+ stub: TextGenerationServiceClient::new(channel),
+ })
+ }
+
+ /// Returns a list of uris or unix sockets of all shards
+ #[instrument(skip(self))]
+ pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
+ let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
+ let response = self.stub.service_discovery(request).await.map_err(|_| {
+ ClientError::Connection("Server does not support v2 interface".to_string())
+ })?;
+ let urls = response
+ .into_inner()
+ .urls
+ .into_iter()
+ // Remove unix socket prefix
+ .map(|url| match url.strip_prefix("unix://") {
+ None => url,
+ Some(stripped_url) => stripped_url.to_string(),
+ })
+ .collect();
+ Ok(urls)
+ }
+
+ /// Get model info
+ #[instrument(skip(self))]
+ pub async fn info(&mut self) -> Result<InfoResponse> {
+ let request = tonic::Request::new(InfoRequest {}).inject_context();
+ let response = self.stub.info(request).await?.into_inner();
+ Ok(response)
+ }
+
+ /// Get model health
+ #[instrument(skip(self))]
+ pub async fn health(&mut self) -> Result<HealthResponse> {
+ let request = tonic::Request::new(HealthRequest {}).inject_context();
+ let response = self.stub.health(request).await?.into_inner();
+ Ok(response)
+ }
+
+ /// Clear the past generations cache
+ #[instrument(skip(self))]
+ pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
+ let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
+ self.stub.clear_cache(request).await?;
+ Ok(())
+ }
+
+ /// Filter a cached batch
+ #[instrument(skip(self))]
+ pub async fn filter_batch(
+ &mut self,
+ batch_id: u64,
+ request_ids: Vec<u64>,
+ ) -> Result