docs: migrate Fern docs from fern/ into docs/ (#6206)

Signed-off-by: Jont828 <jt572@cornell.edu>

docs: migrate Fern docs from fern/ into docs/ (#6206)
Signed-off-by: Jont828 <jt572@cornell.edu>
39d645e5 · Jonathan Tong · GitHub · d381e6ff · d381e6ff · d381e6ff
Unverified Commit 39d645e5 authored Feb 11, 2026 by Jonathan Tong Committed by GitHub Feb 11, 2026
20 changed files
--- a/docs/images/kv_cache_mgr.png
+++ b/docs/images/kv_cache_mgr.png
--- a/docs/images/kv_cache_mgr_design.png
+++ b/docs/images/kv_cache_mgr_design.png
--- a/docs/images/kv_routing.png
+++ b/docs/images/kv_routing.png
--- a/docs/images/kvbm_agg_performance.png
+++ b/docs/images/kvbm_agg_performance.png
--- a/docs/images/kvbm_metrics_grafana.png
+++ b/docs/images/kvbm_metrics_grafana.png
--- a/docs/images/param_mapping.svg
+++ b/docs/images/param_mapping.svg
-<?xml version="1.0" encoding="utf-8"?><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" data-d2-version="0.7.1" preserveAspectRatio="xMinYMin meet" viewBox="0 0 962 567"><svg class="d2-2851460652 d2-svg" width="962" height="567" viewBox="-29 -29 962 567"><rect x="-29.000000" y="-29.000000" width="962.000000" height="567.000000" rx="0.000000" fill="transparent" class=" fill-N7" stroke-width="0" /><style type="text/css"><![CDATA[
-.d2-2851460652 .text-mono {
-	font-family: "d2-2851460652-font-mono";
-}
-@font-face {
-	font-family: d2-2851460652-font-mono;
-	src: url("data:application/font-woff;base64,d09GRgABAAAAABNUAAoAAAAAIGgAAgm6AAAAAAAAAAAAAAAAAAAAAAAAAABPUy8yAAAA9AAAAGAAAABgld/X+GNtYXAAAAFUAAAArwAAAPYE0AVEZ2x5ZgAAAgQAAAkYAAAL7I/vtQ9oZWFkAAALHAAAADYAAAA2GanOOmhoZWEAAAtUAAAAJAAAACQGMwCzaG10eAAAC3gAAACGAAAAsGcgD4xsb2NhAAAMAAAAAFoAAABaSHZFiG1heHAAAAxcAAAAIAAAACAAYAJhbmFtZQAADHwAAAa4AAAQztydAx9wb3N0AAATNAAAACAAAAAg/7gAMwADAlgBkAAFAAACigJYAAAASwKKAlgAAAFeADIBIwAAAgsFCQMEAwICBCAAAvcCADgDAAAAAAAAAABBREJPAEAAIP//Au7/BgAAA9gBEWAAAZ8AAAAAAeYClAAAACAAA3ichM05SgMBAEbhL8m4x33fx3GLoGglIlhYScBWxMqbeCDvoCh4EZsIaUVshF+I6X31Bw81DTU0FbqolAp1pcqBQ0eOnTpz7sKVtht37j0k9Eyrb0765lLbtds/k/ckP/nMV77zkU466eY1L3nOU97y2Dv/V82WHbs2VfYsWrJsxao16zaU9tU1FAYMGjJsxKgxTeMmTJoybcasOfMWbGvxCwAA//8BAAD//5kXLfkAeJx0Vn1sG/X5f77POecmddscju04dfx28V1e/JL4fHdx0sSxk9hOmrROqJu0zYuD0zRN2/SFN7WU/EqhQPsr4rrxThhjlUYRU2n/WdmmvbMKpIkhtCG2FQQTQggmNCEjbRI5T3d22vSPybK/X935nu/n+Tyf5/McVEA3ANbhU0BBJZjgDrAACIyH8Xl4njUaZd4myDLrQqab3FAVQgYiBuneU6deN7QlvkrM/B8+tXKw45F9+zKff/Gzqfvvf+Jz8i4guAGwHRWoBAbAbBR4juNZmqbMgpnlWeMXrrdcjGeTodr914+mPhrv/jpGDs/Oyoei0UPqblRWjrz9NgAABXkAZFGBarCDV8MlhK1WSw1ttOgLSwlhSYxwLMusbvI/790bbe9MZ84u3rdrx8Dw0OTCjsk9OxdQcSc72rZvMqzf1jczTU5Ksuhf+a6zt0sEIBAvFrAZl6EeoMLLcWJEkoSw1WbkONZL05Yaq1UIS7KNpsn0yENDQ2eynROOkD3RFJuMRCZjgbQrxOdNI88dWHhutNUtbvbE7xsdPZ7gWCEQBgCEMQBsQgXWaXzoWWjo+VXQYz96avnFC3cOHD18+OgAKq8tv3S57/9PnjwDGrZjAHgHKrBer4tl9XOMPKP+ilSr/yJDqCTfTX2dAgJnALBW5/3Wf5kz5Pvqb8lG9RtUkh8n1b8BAbFYQAsug+t/5SuEZZEVBYamycjoQwNbH87Gxx2h2lioa0LYPzPQ9PB7rr3lhIV6sc4bv2/05AX+9X71n64AEJgGIP8pYxYFhhU9FpYRLNMXL5IXLl5MIZVMrqykQM/vUQCsRwUqdGYYj+XRUdKPysq18v00AFajApv1+2abIJsFhmUikiSzRoqleNaJFiY9N+E2uCbnMhVGpHxTWyY4pOgKVNQvFhZI7coRknaPZR2nVJXgKUd2zK2+qcUeBUAaFTCvxuY4kREYLajVamFGJ96PIVZmSgsq6uzjbQciZMfKEbL8eHheUF8DhNZiARtxGTZpCNewqZWY5ksV9mqcEv/gsZ6eY4Ol36179mzdumePafT5gwvPZjLPLhx8fnRAWTp5/vzJJUXTzBwAulABU6kjVyOyLHNT7HPXBw51dS2m796/884d2f2oNGTT/bv96nckHU+mZNC111qcxBZchvXAAfjW4KnwcvxtaI23ZO4vwyMwtlg7fW8J9ZEJZlDYZK+srvZFldMa0tPKjfEEc/V7pQyevGJuaaYNvfQ67dzZsuY3gm1t55pZak0Gs3
/s3deZ6X11+gf3HhoeGRk+hAo70jc0yaj/IBb1K7Ir1hOPlHTQWyygHZchoLPMyzpYMcJxPB/E2xWsNazN5kSNL9KWPuEP+/LtfVtdonfKE/fLM7Hu+Qa/e5sQTbKSY6IpzrfPm0R/hy/QEWSbHRubNjQnWsPbA4EGqd4T8bsa60yN1YF4WyQbBgLNABhEBYwAnrKyCX6Ihg9xMJlc+amOdXuxoOlFq5yuCEZgSh4i6VuaJoGevdEdDTG+sds3Es2bIsemyHPqXN9IQ8NIH3lBnZ86FgECVQC4DS+AD0CgBLMTbUI3yrJgK+/MAsVSJW81UouzU61UhYFQdFUV3ZPpNlZV0gakDFRwd26+x2iqoCqq1vXgBXV2cyDk8YT8mwuFzf7Sjry0cpisc3Y4nR1O9d96DhIAelEBO4BgpgSb1WoTJEmWBcrMlv3caJTeujYxsaG22rDRYto9fu2tifwGB2PYWLfhLnIn6bpk9dfX+62X1F+rV6/WCk6nUHtVj80BYBgV2ATgEW+LbSF4Yzxvbqgx1HDm6Z03viQ//r1vqLFxiPuduvtLbSakigVk8BysBxuwOr9iRCo56tpuk9dcJ/6FBx5YOHDixIHebLZX+9p9Prvd5zNdfvmVS5deefly4vTZcw8+eO7s6T81uFws63I16DhndO9dAtOqj0mSLDCCZebNJ9uHXV2v9pEPxHW26pXrfSWN9gNgFZ7TvUrsxhIYIy+VcdBGSRIEy8DBZ4b7UoFhV6hltjd3ZPDMmLPL8X5bTrlHlJMBd8gv7stueeCx7WjQPD1aLCCN5zTd3aZ7VlzNtnxAaTCWVe8aPco2u3d3xccXlo7mUp2BjNvfuK9ry3T7cGdLyh+bN8ms5Az2iNFkrD8ckhrqI2yAS0c6BmsMlS0Jf/uoHxB4APTgElRCDYAsaOlrZTeLHpFoPLCW018TilRs8lbH1WskNp7Pf3O9rqvOFrKpkSsyeVa9J3EFCLQXC1iFS+BZm8NN9GaPxWO85ZnEmT3oa3Lv6AkMtuxMNbZ7/RayX/2EcYi+rnxn70GT5JEcgYaEPzFYY3YQIfVL08aW8f7+mdKsHS4W0KrrowmAeMuuJkY4Svfmblwz6Yw3K0N6uDGuMpuMbkvG+w/3+LfmIp3jrqA90SCMychPte5a7Jwh6cbgxMxwLJ5Uf9J3di73WJbn6wXb5sj9+7hA/q6uiQhQ0FcsoBnPgROaQb41EWTxpteuqRN1S7BWqmxn+gAmwfT+aGPM2xgVdkWn5qONbNQjzdkyiZjYGxgmqWFxT3swNm4KZML+nmC1wT4Ybhtsyg0GtjsMTMuWYGhbgMx3JEOJ9hAXZtXrsbaA4DXbE61iPxD4Q7GFnIcX4Q4AGy9JMk2za9rlhq27Gw1VFe5gq9sfnPxLxJHpJITz+fj+zvHjUCzC34sF8iBSyOsczwGtrcUinCY58gG+gRw0PwFAQ/PdoF/PkhakyH6koUXvky3Fe9BR/AVQADbRY9lCPjmf0rT+XjFDPsYbsAGgQqdLG0aWGquNXJ0/fnw+MJvLzV4Z+ezChc9GmrPvLC29ky313Ylihjxeek7LR4zoGrPU0K8G9k5P7w3MHz9+pfxAs/44EMgXc+RbvK5jIALJk/Sg+uYPqbnvXihjht8gRf6MHLTAAdCQP62fNQRFNJGntalGvEG82X9OXB2ejzhDTbzF1sSIdWluzZ4Qu9fujdhSkr7WJiUtns4lvKS9K1Xc5lneUMjrDYVMIR8XCnG+EBD4luRIHt/Q3pUIzwtGI6m24yLaSe7TxcVPS1yUudbyMoseS5ZcJi3d3QDwXwAAAP//AQAA//+l2ZVwAAEAAAACCbr4CJlFXw889QADA+gAAAAA3B0N9wAAAADcHHNL/z/+OgMZBCQAAAADAAIAAAAAAAAAAQAAA9j+7wAAAlj/P/8/AxkAAQAAAAAAAAAAAAAAAAAAACx4nCyOsUqCAQAGjxt7iqaCppaiIIKoqKEIAvE2Ec
UHcHQU9M3c3X0Od/nhnz6+44YzXg2Ma2NhvBlTY23sjAdjZuyNb+PfuDdW4y6Nd+PO+DOujEfjxvgy5san8WzcGk/Gr/FhHIyTsTUmxotxNDZjy8B+Ruc8/AsAAAD//wEAAP//4MMfBQAAAAAAKgAqAE4AggCyANAA5gD6ASoBQAFQAX4BoAHMAfACLAJUApgCqgLOAwwDOgNYA44DpgPQBA4EMgRoBKgE+gUaBSYFMgU6BUYFYgV8BYwFmAXABdYF6AX2AAAAAQAAACwB+AAqAGUABgABAAAAAAAAAAAAAAAAAAMAA3icnJZLbJPZFcd/zrkBv3gZVA0IVVcjhKYIjJ1JwE0g4JABwiBCSWbaClHVJMaxSOzIdmDoYhZdVl11XXUzXbQStAolaiaBQiCkagWq1EU1q666qLroqppFV9V3vuPEcRI6g5DI7z7O/57Xvf6Ai3ILIeKiEUiCcYQkSeMODvGOsZDklLEjyUXjTpKMGm8jyQ+Nt5Ni0jjKYT41jnGYXxrHOcKfjROc4D/GSQYjR4x30hupGO/iYORXxrvpiiwb72nxM8XByJfGe1d1YsBKR8o4wjc7vjDuYGfHl8bCZXHGrmVPJ+Ny1XgbR+SR8Xaeyd+No3S7XxjH6HZ/NU7Q1bnNeIf4zpzxTrqj3ws5ArujPzWOsDv6c+MODkTvGwvJ6IqxIxU1/Ugnqeg/jLeRilosQf5jUeMoh2IHjGP4WL9xnKOxHxgnyMR+YpwkHVsw3kFX7J/GO8nFmzq7OBy/ZrybU/FPjPe0+Jzi3bjlKrK3RXPfqub+CKTifzOOkIo35zt4N/5fY2Ff4qCx40AiY9zJgcQl420cSIwbb2df4lPjKJnEz4xjvJd4bhznaOJfxgm6k98wTpJLNjV3cir5Y+NdZJJ/MN7NxeS/jfe0+Jmia8cJ472BjszKM1mUV3gKLVyijOcwnkm8PJY5vMzKgizJnDyWV/JE5uS5fCb35bH8Hh+5JEvyQP4kT/DysIXnW3hFPpMHsiQP5XNZkKd4l5UFeSlL8rksyqLOvjL7WfmjvMZzveMLbgRnyCN5oCqhLwtyX+ZlTpYDHa6T4YYsy0t5Jk/ld2q/onq/wcszmZXXsiizuvPYFjufynON8YUsy5wsyW/lRXOW6xzhhryQ1/JYHspTWQxODc6Wl3h5pDOzahPObO7joS1Ovo+XOXkis5qFIMvLzXn196ie3pJfjqqna3VryXfbWknHG/PeUhXbsVpJfo2niwxZMniO2ahLR3nGqXKTIp4R7lGnQZEp6niGqDBGlRrT+n9B18bxvMcEDRpM08txjnNX/6UprKql1XKK43wr8Ie7lGkwgecaReoUqXHH1M5TpUIDzxUKTAW++HcYocoMNcYo+v2kW8d4zlFlXOkqNaqqWmKGSQrU6CJNhvfJ0UeeQQYYpm+dQtM+tD7WZh9aDTPAB3ysvtYpq5d+nfYEVRoaaYU7eLK6liZLlhP0MUWB2xR11y2KfKIeBwo9pDlBDye0Ll/ds/VZKGudCngaWp9xrV2w7zaeKrfeusJljTWoWGD3ERWtX7g2QsN2hqdXGOe42nuNdEIz5lV5Ritbo6y702/lzVUKGr9nkDSei6Ya9NWoZjf4O6P9FvhdpPI1+rPBPaYpMsqE5XOtH0c0hw3uak7XMj5JWStQ0U4OcjKjWQjjbmZthCEu4xlW/co65cvrFIJI2vssq32U1tgmNj13rf53KFDWDrnJpK6s3beCnpvnO8oNevFt2akzphWapqE1qqtWWmtQ4jjDnOdymyf/P0fj+jes/U1mVrsnjC7omuCW5xnRyo/4/XgGdDzEiGbkuwwxykWG+YhRHee5xjXyXGGUIT5Q22Gu6XswzBUG1WJIOVw7rzfgCt/H8yFDuifQLlp+wooFN3Nava+r72Evl5liWnMeeJ7WWIsa4devsOeWqTZt62ozRplbutNr/Sp61wuUrCum1cMpzWWzN9ZuXdgRUxpLUNu19RJVfV
9renMDVc89ezuCbg19Cl+Ixleoavqteqa+msOi+rx+XLLfgbK+jeGr0/xGGdFfgrL+fo2p14FtEFHwe9k+M79hZkVrVeMm5bDXZIVz3NPTJu0eeW5qbGoRfplQ1yrUtUaBRz9SlWrzm8ReiyolfZ+mNXNjeqPu6SjsAv0q2XJvwV69mmb9dvN7ZMPZwVs1ae++19hKpn6IGxSYNJWKvZSeCjP6+1nT1fCuaWxk3+hPu1K99UtlQxWP6tveXpP22m62S79m2ivjsuuqvZndijvjzrp+l3cDrt99G+8y7TOU3Md4l8O7v+BdHu9OuozLux53wfW6jDvlci7vMkp51+tygVXkknK/ap3RHafdh8GKPNxyZX7LlRU976zLrp3gskpnXc71uT6Xcxdcj65m3DDe9bqzLuMGgnGzB9XvC6rT6067c24gVHenXb/rc5ebvegGXM6dcf3ufdUYbDmz2/W4wcCzZi9uujf04KTrcj3upOt2/WGmmv24pR8n3WmXcb16Tr9GlQlUm525hV89VpFTGn+wZ8D1BBlp7bWNdQ764Y012pBvtdjQHW/Umd+sM95osfI/AAAA//8BAAD//5uVuAcAAwAAAAAAAP+1ADIAAAABAAAAAAAAAAAAAAAAAAAAAA==");
-}
-.d2-2851460652 .text-mono-bold {
-	font-family: "d2-2851460652-font-mono-bold";
-}
-@font-face {
-	font-family: d2-2851460652-font-mono-bold;
-	src: url("data:application/font-woff;base64,d09GRgABAAAAABHwAAwAAAAAHegAAQScAAAAAAAAAAAAAAAAAAAAAAAAAABPUy8yAAABHAAAAGAAAABgmKbWhWNtYXAAAAF8AAAArwAAAPYE0AVEZ2FzcAAAAiwAAAAIAAAACAAAABBnbHlmAAACNAAACUgAAAxgcue/J2hlYWQAAAt8AAAANgAAADYbI9ohaGhlYQAAC7QAAAAkAAAAJAYzAMBobXR4AAAL2AAAAIIAAACwZyAMZGxvY2EAAAxcAAAAWgAAAFpLTEgybWF4cAAADLgAAAAgAAAAIABgAmpuYW1lAAAM2AAABO8AAA2sAwZtKnBvc3QAABHIAAAAIAAAACD/uAAzcHJlcAAAEegAAAAHAAAAB2gGjIUABAJYArwABQAAAooCWAAAAEsCigJYAAABXgAyAR4AAAILAwkDBAMCAgQgAAL3AgA4AwAAAAAAAAAAQURCTwCgACD//wPY/u8AAAQkAcZgAAGfAAAAAAHeApQAAAAgAAN4nITNOUoDAQBG4S/JuMd938dxi6BoJSJYWEnAVsTKm3gg76AoeBGbCGlFbIRfiOl99QcPNQ01NBW6qJQKdaXKgUNHjp06c+7ClbYbd+49JPRMq29O+uZS27XbP5P3JD/5zFe+85FOOunmNS95zlPe8tg7/1fNlh27NlX2LFqybMWqNes2lPbVNRQGDBoybMSoMU3jJkyaMm3GrDnzFmxr8QsAAP//AQAA//+ZFy35AAABAAH//wAPeJyEVm1sW9X5f86xc00S58Wx77VjO/favva9fovfrn2dNyexEzuvjvNKXtskhQL9J01D0n+DFLcbQ6WAHCijXUO3IjTaD2MCsXXreBNfmPah7ANCk9AGiKkbVNqEBJP3BRF7utdOS2HVvtxzpHPueX6/5/n9nnOgAlgAHMHnQAGVoIYGIAE2NVaNQ+B5VqWK8nohGmUZrGFxQ+HKZadT6couLl5Rephd5v8P4nN7q/PpQ4dq3nhzfbG9/RdvoE0ADJUAOINzUAMagE2toGUVHMezBKFS8BErWfnea+/9bFJtVivVxpqpetSCc3vH0WBwTRDWgoVrP93cBAUMAuAQzoEGjBLGTY0QoihSR6hIeSBYhRASI2GOZTX7k8GPele7xLZUf+Lhvvm4GAiGE5mOtraODM41pbq843VK9VAiMeVCj3scdqYw5/V6OABAIBbzuA3vAg2QtPlwJCyKQojSqziOtREEqaMoISRG9QSB7o8dnQxM7Ex33m8d00ftvn63eyhobzOMOVfV7qmT46vPjwmWeapROBjvWQxZjLP+IGBIAeAwzkFVKbtlJgTLCyFRgs6xbOo3izsj6afmXIbwsMczHDbgXO8z6+s/7ttyLoyMzDpknAsAmMI5qJZrRFpJgWRJK7mAXiv8/auvEIdz2ce2f5KV9x4BwEack2pxe+8R9KvCjXwe57IXsnsg7/MW85jFu2CVuHPcXbhH2IigIQg0PfHE6OiTk4mDzBgZ4t29vC0laNmqxb9Zj6qHzq8dfX5UsMyTxhL9qqr1Rwp/YgJyHKmetWXsEUHDRqwkqxHIwd1d9Pvd3Sw6l80WVrMlTCsA2IJzUCFnS2MlV55DNpzb+7K83gaAaZyDJnldL0lUOjHciaOsSsXyPEsrSLLtYjelpLovZpWECitCoX7Br8AqQolzN+fmbu4dv24amxk1vnrp0qvG0Zkx0/XS2YlyjrXy2Vq9wHERCa+CZymKJBMXnmxRVtTvlAacK7x9NvzD1pt7x1Hy6cjJtpsg6Z8r5nEI70IdMHdkVa46v1/zcnqRd+REInFipPS1hUymkE3+qkcurB89n06fP7p+YeQHwcVU70IgsNCbWgxKMYYAcBDnQP0dTbHkLT8M3ezbTKaOJyeHYu2x9iGc4xcyw4f8n6BxMRR2gQK44gjuK+P0/W+kWhvHR2m874W74f5k+TH6fBUvmO
pMVcb6wB00HgksD5doDC8HCjceGK2j2dp7KtYq1QAK2Sf9ZU6Nd2OljWrZ1M3kRjK5kZzsb4vF2vpbH3j/Cs45ZtNDB5v/iZaCgQBX+HqxcFaqZ6iYxzzehWaZHx+V9Syx4vnvOl3iqNfTWIqI3IlHI1OOWb+/2eBjJu3dfGylr33DO2TrDdh95gCT8Xba2h9WB3yHac7SqDeRNfZafzIgTke87gONJrpJa9SpbfX+Xp+40AJIUgJuxTlQSbxK6r/xLja+i+uz2b0vS9rrLuaxAedAX+5DGkFT8p8oT2sx8nXMttPP6ZwM49I9bW6fUbM9y93o2cISL5rNIo9eLBzrXu5hAYESAE/iHDgANhWCVkdRpNCJo1FBT2O9NNMKCpYvt2XV/PTzOoyUymp1hfegi6hSK5UIIdTwzOgLHFGFFYp7CA7nClfMkQhNR0TTtWumcJSmo2ETmt87fp3pbmrqZq5LHBoAcArn5PppFYKeovSCKEajgkJ7O1zDO1evdFUb6pTVVHXbC1ffudxX01irVBvUvSiDYttUmGHC1Hbh7cLrp40CTQvG03J+6sv6qJN6yB1nkzc+eCFR11SnrGdqExc/+Bi9dskxwPMDjkuF0Y9B0lZLMY8t+AzUQBO4bmldCP1XpUdttxeRd2At9vjasdOxo4MWCYyFCdN0mGG8Xsbi8ajT51Zeunz5pZVz6Q3n0vjEAYfjwMT4kvOs28a6XKzNDQgyALgJb0PNfu/rxFFBI5CZX+54ukKG1bNb6Ni8qo6s2/tmS+IpAGA9PgNmaX8nlmRvI1T8bWeqRFEQyLb5J4ejQa7LlAkeGUgcbo2vxEzdhguT6ZNHmv1B3pgRQqH5mLh2TFRUZKVz3cU81uIz+14vm6HEPFKa3BlIRZbdoBdnU4Zey6jHJjK2iaGuQVvI64hbB/2rseT/tXe3OuOO1IZa7zQdMNs1NoPZRT1EcY02e5NpjvO40i3BFKWsdnW62ic8gOQ3hw9vS/Vc1oTFqKwJK2mNIEGDppRIWe+oX0In9/6NKn+NNukMbRTNT60fRqf2SjrwFfNYh7elc0o85KZ0C77WSlpVt5OFTNGZXmOC9EVoewuTCDEea1yDThe+rtY7DO33daQ21AZn4wFNk0bvtQRCtQ1uFM9u3aNyz/T03ReW4nUU89i6nzepa3Ti79+R388egWLBhQGPL30o0vOgJa3rsLiSbluUNnpNYbe5iyHYGUd0Ntz1oNo+vDU5+6NJVzMz02By3zeZmXVQzfagh+aWHLx7uiexJEgajhXz2IzPAA0eaL/dr6ORW337WzVT7CtawqeQep4P87VYvnI8A6utVtHqjEQPdS0+LDpsItOyYguQziY2aGlVNEcpV4utwVxPh9XejMC3eeqV5v6QmHEvZrx9BmV9c6cnlPGjU95go5102I12c+Fj3klaDGRNDePQmZ2kXKdrRR69Ah9KN9Q0L4o8rycI9tvG+8jRE8cVHbVWrTlEJ3ydnSt/vlc/GNi6X11taLb6WufHFk8Xi/B6MY9exDaCl5yLeoCQxmIRxpEfE/gVJQ/u4jYAqMANkkKktRSicAM6RRDggVINF7C5eB0UANMRK9mB/np5bQ0Q/LaYRn/Bn0reTMrIpLuN1FF69MTyxsby+Nzo6NzV6c92dj6/Nz795taJt6ZLvfpUMY1eKv0ncYuEZTuROuJ9+Zfx5Y2Nq/Hpt05svTkdv/fznZ3PpDdQcQnX4j/IGJCABtHAduF3Lyoe+OZiGTP8HDegfxEceOA4SMhzcqww/APb0ctSTz1s82FJdCWZlXQv38aP0gHOrqs1GipnLFM+xs/ZdXWNhsoZZupLPa1vsJtdIz23JoBKOYVPpfdY8o5uJ8TjQri7W90bDCaTwWAvIPgQ+dGj+BXpPTbB84JKVWQrXq5gkf+LZ5/9AtB+riVeExErmUJ/RNRDDwHAfwAAAP//AQAA//8dNpNsAAEAAAABBJwvNIvQXw889QADA+gAAAAA3B
xzpAAAAADdlx6g/0z+OgMMBCQAAQAGAAIAAAAAAAAAAQAAA9j+7wAAAlj/TP9MAwwAAQAAAAAAAAAAAAAAAAAAACx4nCyNsUkDUBgGj8PGTlzAQrAQsVAREQWxSJEUgVRXZIGQHbJVQjbJOuHBq37+4/jOeDQwbo2l8WksjK2xM14m3xvfxr/xZKzmHe6b8WD8GTfGvXFnfBlr4914nq1X48f4NU7G2djMncGPxmE2B/uYzmX8VwAAAP//AQAA///UWBvdAAAAAAAqACoATACCALIA1gDsAQABNAFKAVoBiAGqAdwB/gJGAnACtALGAuwDKANWA3QDsgPKA/YEOgRaBJIE1gUuBVIFXgVsBXQFgAWcBbYFxgXSBfwGEgYiBjAAAAABAAAALAH4ACoAbgAGAAEAAAAAAAAAAAAAAAAAAwADeJyclk1vG9UXxn9jp7bHTfvPP5TSFCiXEkoaJRM7SqMqRQK3aVVDSEqcUqFSCcd2nFH8JnvcNqxZsGTFZwDEqqsuEGKVBQuWiBUrxIoPgFggNGeOPWPXJG1VqXnu3PP6POfea+Cd2N/EscZs4AAUW5zjQHGMFL8rjrPCn4rHmLEuKD5G2VpXnGDaeqQ4yY/WL4pTLMW+UmyzFPtJ8XEWY/8oPhE38YzikywlbimeYjrxeYAtSCe+VmwxntBcVoyJxA+K40wkflY8xtnEb4qPMZ74S3GCyeSY4iSTydOKU0wmZxTbTCZXFKeZTq4pPo5JthSPM5f8UvEJMsnvFZ/ESSpX1v9YTJ1VPMHlVC/O/7mQ6vU1ydupbxW/EKn5FOdTfyh+MdL76UjvL0VynYnkmuKknVJ8lnG71+PLEd9XOGWfV/wqaXtZ8bmI72uM2+8qNkzYvfpfD2fDOs+k/YniN0jbDcXTkThvRmp4iyX7oeKLzNrfKZ7FsXVmrDnm0j2N5iN5HTJpnRNrIVJDhpn0p4oXmU1/ofhapN9V4fAbDItkyJLBMK+rRVnlKNNkmwqGAvt08KhQp4MhT4MSTdq05P+i7JUxzLCLh0eLFRZY4IH8cyj2ozniWWeBi8xheICLxy6GTSp0qNDmvka7QZMGHoZ1itT9WswZCjTp0qZExUzhRNcYrtGkLOgWbZpcpUmNMlkc6fQyV8ixylU2uDLg2/MM/Ob7nofHN327j6T2Dq5UbQYy7tLEk84b3O/vOWTJsswV6hTZoyJWO1R4KBkWcbiEwzKXWJZYz16vK4oVMXiiVFlULNJmD0OTnefW2pUufe18v9s0RMlgr4CnlkH2BmUWxN9Ij7vClZHIXdG4jSvWznNVc4siXWoYVnEw3NSo/oRtCa/+365Mnl93hcYzTKrHPi0qbLGrfIaTWRAOPR4IpyHjNVxRoCEz7XPSFRaCvnusFcizhmFD4jcGIq8NRPA7GTVhWek3rGwwb6j/fYq41CiyTU12wpNXlLw5PhTssYIZYqdDSRRq4YlGHYnliAZVFtjgBmtDlRzNUVn+Btpv0+1PT9CdPzX+ec9REOULZkpOW05YKwgjd8izxU02uM2WrHNsskmOdbbIc118N9iUk7vBOqvikRcc7N2QE7DOxxjeJy82fuyK8hMo5p/JllTfkdqDWXap0xLO/cod6bUiHT67woYdjdrz7YhPCZcdsTSiX4MqXYpUdSpaUmFduOzNRnjqgomoSy++tuF+labctG05uX5Uw77eHf60BjUFN4T3FKo6zzUz/32jbcrp87sIUV66CGa802e/It0Orqv6lrhynwb3leGC8FGQ18TFWO9Rkuy+r8+FiT964svjJ74ciMpttnGDKY0fcI19yVbT6gzbwop4cDf2K/foiH4dUdev6DOJ4t9Nd8lwT++ZJlW52VrCeUnO4r6sgvm5y/whtkW9L9ui157Yz47IXZbXoibaGemtqtGnuSccezobwR1raNCVN7gtu8Epld7IHlrPcKSO9jCndQ2qOCevwrAmw9qOsnosX4eUGcsOqD3K70B+eVTl/fDZuC
MnvyrTfJ2H+m6u9b+F6APh0hVeCvJG+fdY8AqHnr13+arEL7E3cubDGZ8fmfUon6e3HOz2KOvBHg+3HebgKPtRv1hG2ylz/wIAAP//AQAA///7vB6iAAADAAAAAAAA/7UAMgAAAAEAAAAAAAAAAAAAAAAAAAAAuAH/hbAEjQA=");
-}
-    @media (prefers-color-scheme: dark) {
-      .d2-2851460652 .fill-N1 { fill: #E8E8E8; }
-      .d2-2851460652 .fill-N2 { fill: #CCCCCC; }
-      .d2-2851460652 .fill-N3 { fill: #999999; }
-      .d2-2851460652 .fill-B1 { fill: #E8E8E8; }
-      .d2-2851460652 .fill-B2 { fill: #7BACFF; }
-      .d2-2851460652 .fill-B3 { fill: #6AAFDC; }
-      .d2-2851460652 .fill-N4 { fill: #3A3A44; }
-      .d2-2851460652 .fill-N5 { fill: #2E2E38; }
-      .d2-2851460652 .fill-N6 { fill: #252530; }
-      .d2-2851460652 .fill-N7 { fill: transparent; }
-      .d2-2851460652 .fill-B4 { fill: #2A2A34; }
-      .d2-2851460652 .fill-B5 { fill: #1E1E28; }
-      .d2-2851460652 .fill-B6 { fill: #16161E; }
-      .d2-2851460652 .stroke-N1 { stroke: #E8E8E8; }
-      .d2-2851460652 .stroke-N2 { stroke: #CCCCCC; }
-      .d2-2851460652 .stroke-N3 { stroke: #999999; }
-      .d2-2851460652 .stroke-N4 { stroke: #555555; }
-      .d2-2851460652 .stroke-N5 { stroke: #444444; }
-      .d2-2851460652 .stroke-N6 { stroke: #333333; }
-      .d2-2851460652 .stroke-N7 { stroke: transparent; }
-      .d2-2851460652 .stroke-B1 { stroke: #E8E8E8; }
-      .d2-2851460652 .stroke-B2 { stroke: #7BACFF; }
-      .d2-2851460652 .stroke-B3 { stroke: #6AAFDC; }
-      .d2-2851460652 .stroke-B4 { stroke: #3A3A44; }
-      .d2-2851460652 .stroke-B5 { stroke: #2E2E38; }
-      .d2-2851460652 .stroke-B6 { stroke: #252530; }
-      .d2-2851460652 .color-N1 { color: #E8E8E8; }
-      .d2-2851460652 .color-N2 { color: #CCCCCC; }
-      .d2-2851460652 .color-N3 { color: #999999; }
-      .d2-2851460652 .connection.fill-B1 { fill: #E8E8E8; }
-    }
-    ]]></style><style type="text/css"><![CDATA[.shape {
-  shape-rendering: geometricPrecision;
-  stroke-linejoin: round;
-}
-.connection {
-  stroke-linecap: round;
-  stroke-linejoin: round;
-}
-.blend {
-  mix-blend-mode: multiply;
-  opacity: 0.5;
-}
-
-		.d2-2851460652 .fill-N1{fill:#000410;}
-		.d2-2851460652 .fill-N2{fill:#0000B8;}
-		.d2-2851460652 .fill-N3{fill:#9499AB;}
-		.d2-2851460652 .fill-N4{fill:#CFD2DD;}
-		.d2-2851460652 .fill-N5{fill:#C3DEF3;}
-		.d2-2851460652 .fill-N6{fill:#EEF1F8;}
-		.d2-2851460652 .fill-N7{fill:#FFFFFF;}
-		.d2-2851460652 .fill-B1{fill:#000410;}
-		.d2-2851460652 .fill-B2{fill:#0000E4;}
-		.d2-2851460652 .fill-B3{fill:#5AA4DC;}
-		.d2-2851460652 .fill-B4{fill:#E7E9EE;}
-		.d2-2851460652 .fill-B5{fill:#F5F6F9;}
-		.d2-2851460652 .fill-B6{fill:#FFFFFF;}
-		.d2-2851460652 .fill-AA2{fill:#008566;}
-		.d2-2851460652 .fill-AA4{fill:#45BBA5;}
-		.d2-2851460652 .fill-AA5{fill:#7ACCBD;}
-		.d2-2851460652 .fill-AB4{fill:#F1C759;}
-		.d2-2851460652 .fill-AB5{fill:#F9E088;}
-		.d2-2851460652 .stroke-N1{stroke:#000410;}
-		.d2-2851460652 .stroke-N2{stroke:#0000B8;}
-		.d2-2851460652 .stroke-N3{stroke:#9499AB;}
-		.d2-2851460652 .stroke-N4{stroke:#CFD2DD;}
-		.d2-2851460652 .stroke-N5{stroke:#C3DEF3;}
-		.d2-2851460652 .stroke-N6{stroke:#EEF1F8;}
-		.d2-2851460652 .stroke-N7{stroke:#FFFFFF;}
-		.d2-2851460652 .stroke-B1{stroke:#000410;}
-		.d2-2851460652 .stroke-B2{stroke:#0000E4;}
-		.d2-2851460652 .stroke-B3{stroke:#5AA4DC;}
-		.d2-2851460652 .stroke-B4{stroke:#E7E9EE;}
-		.d2-2851460652 .stroke-B5{stroke:#F5F6F9;}
-		.d2-2851460652 .stroke-B6{stroke:#FFFFFF;}
-		.d2-2851460652 .stroke-AA2{stroke:#008566;}
-		.d2-2851460652 .stroke-AA4{stroke:#45BBA5;}
-		.d2-2851460652 .stroke-AA5{stroke:#7ACCBD;}
-		.d2-2851460652 .stroke-AB4{stroke:#F1C759;}
-		.d2-2851460652 .stroke-AB5{stroke:#F9E088;}
-		.d2-2851460652 .background-color-N1{background-color:#000410;}
-		.d2-2851460652 .background-color-N2{background-color:#0000B8;}
-		.d2-2851460652 .background-color-N3{background-color:#9499AB;}
-		.d2-2851460652 .background-color-N4{background-color:#CFD2DD;}
-		.d2-2851460652 .background-color-N5{background-color:#C3DEF3;}
-		.d2-2851460652 .background-color-N6{background-color:#EEF1F8;}
-		.d2-2851460652 .background-color-N7{background-color:#FFFFFF;}
-		.d2-2851460652 .background-color-B1{background-color:#000410;}
-		.d2-2851460652 .background-color-B2{background-color:#0000E4;}
-		.d2-2851460652 .background-color-B3{background-color:#5AA4DC;}
-		.d2-2851460652 .background-color-B4{background-color:#E7E9EE;}
-		.d2-2851460652 .background-color-B5{background-color:#F5F6F9;}
-		.d2-2851460652 .background-color-B6{background-color:#FFFFFF;}
-		.d2-2851460652 .background-color-AA2{background-color:#008566;}
-		.d2-2851460652 .background-color-AA4{background-color:#45BBA5;}
-		.d2-2851460652 .background-color-AA5{background-color:#7ACCBD;}
-		.d2-2851460652 .background-color-AB4{background-color:#F1C759;}
-		.d2-2851460652 .background-color-AB5{background-color:#F9E088;}
-		.d2-2851460652 .color-N1{color:#000410;}
-		.d2-2851460652 .color-N2{color:#0000B8;}
-		.d2-2851460652 .color-N3{color:#9499AB;}
-		.d2-2851460652 .color-N4{color:#CFD2DD;}
-		.d2-2851460652 .color-N5{color:#C3DEF3;}
-		.d2-2851460652 .color-N6{color:#EEF1F8;}
-		.d2-2851460652 .color-N7{color:#FFFFFF;}
-		.d2-2851460652 .color-B1{color:#000410;}
-		.d2-2851460652 .color-B2{color:#0000E4;}
-		.d2-2851460652 .color-B3{color:#5AA4DC;}
-		.d2-2851460652 .color-B4{color:#E7E9EE;}
-		.d2-2851460652 .color-B5{color:#F5F6F9;}
-		.d2-2851460652 .color-B6{color:#FFFFFF;}
-		.d2-2851460652 .color-AA2{color:#008566;}
-		.d2-2851460652 .color-AA4{color:#45BBA5;}
-		.d2-2851460652 .color-AA5{color:#7ACCBD;}
-		.d2-2851460652 .color-AB4{color:#F1C759;}
-		.d2-2851460652 .color-AB5{color:#F9E088;}.appendix text.text{fill:#000410}.md{--color-fg-default:#000410;--color-fg-muted:#0000B8;--color-fg-subtle:#9499AB;--color-canvas-default:#FFFFFF;--color-canvas-subtle:#EEF1F8;--color-border-default:#000410;--color-border-muted:#0000E4;--color-neutral-muted:#EEF1F8;--color-accent-fg:#0000E4;--color-accent-emphasis:#0000E4;--color-attention-subtle:#0000B8;--color-danger-fg:red;}.sketch-overlay-B1{fill:url(#streaks-darker-d2-2851460652);mix-blend-mode:lighten}.sketch-overlay-B2{fill:url(#streaks-darker-d2-2851460652);mix-blend-mode:lighten}.sketch-overlay-B3{fill:url(#streaks-normal-d2-2851460652);mix-blend-mode:color-burn}.sketch-overlay-B4{fill:url(#streaks-bright-d2-2851460652);mix-blend-mode:darken}.sketch-overlay-B5{fill:url(#streaks-bright-d2-2851460652);mix-blend-mode:darken}.sketch-overlay-B6{fill:url(#streaks-bright-d2-2851460652);mix-blend-mode:darken}.sketch-overlay-AA2{fill:url(#streaks-dark-d2-2851460652);mix-blend-mode:overlay}.sketch-overlay-AA4{fill:url(#streaks-normal-d2-2851460652);mix-blend-mode:color-burn}.sketch-overlay-AA5{fill:url(#streaks-normal-d2-2851460652);mix-blend-mode:color-burn}.sketch-overlay-AB4{fill:url(#streaks-normal-d2-2851460652);mix-blend-mode:color-burn}.sketch-overlay-AB5{fill:url(#streaks-normal-d2-2851460652);mix-blend-mode:color-burn}.sketch-overlay-N1{fill:url(#streaks-darker-d2-2851460652);mix-blend-mode:lighten}.sketch-overlay-N2{fill:url(#streaks-darker-d2-2851460652);mix-blend-mode:lighten}.sketch-overlay-N3{fill:url(#streaks-normal-d2-2851460652);mix-blend-mode:color-burn}.sketch-overlay-N4{fill:url(#streaks-normal-d2-2851460652);mix-blend-mode:color-burn}.sketch-overlay-N5{fill:url(#streaks-normal-d2-2851460652);mix-blend-mode:color-burn}.sketch-overlay-N6{fill:url(#streaks-bright-d2-2851460652);mix-blend-mode:darken}.sketch-overlay-N7{fill:url(#streaks-bright-d2-2851460652);mix-blend-mode:darken}.light-code{display: block}.dark-code{display: none}
-    @media (prefers-color-scheme: dark) {
-      .d2-2851460652 .fill-N1 { fill: #E8E8E8; }
-      .d2-2851460652 .fill-N2 { fill: #CCCCCC; }
-      .d2-2851460652 .fill-N3 { fill: #999999; }
-      .d2-2851460652 .fill-B1 { fill: #E8E8E8; }
-      .d2-2851460652 .fill-B2 { fill: #7BACFF; }
-      .d2-2851460652 .fill-B3 { fill: #6AAFDC; }
-      .d2-2851460652 .fill-N4 { fill: #3A3A44; }
-      .d2-2851460652 .fill-N5 { fill: #2E2E38; }
-      .d2-2851460652 .fill-N6 { fill: #252530; }
-      .d2-2851460652 .fill-N7 { fill: transparent; }
-      .d2-2851460652 .fill-B4 { fill: #2A2A34; }
-      .d2-2851460652 .fill-B5 { fill: #1E1E28; }
-      .d2-2851460652 .fill-B6 { fill: #16161E; }
-      .d2-2851460652 .stroke-N1 { stroke: #E8E8E8; }
-      .d2-2851460652 .stroke-N2 { stroke: #CCCCCC; }
-      .d2-2851460652 .stroke-N3 { stroke: #999999; }
-      .d2-2851460652 .stroke-N4 { stroke: #555555; }
-      .d2-2851460652 .stroke-N5 { stroke: #444444; }
-      .d2-2851460652 .stroke-N6 { stroke: #333333; }
-      .d2-2851460652 .stroke-N7 { stroke: transparent; }
-      .d2-2851460652 .stroke-B1 { stroke: #E8E8E8; }
-      .d2-2851460652 .stroke-B2 { stroke: #7BACFF; }
-      .d2-2851460652 .stroke-B3 { stroke: #6AAFDC; }
-      .d2-2851460652 .stroke-B4 { stroke: #3A3A44; }
-      .d2-2851460652 .stroke-B5 { stroke: #2E2E38; }
-      .d2-2851460652 .stroke-B6 { stroke: #252530; }
-      .d2-2851460652 .color-N1 { color: #E8E8E8; }
-      .d2-2851460652 .color-N2 { color: #CCCCCC; }
-      .d2-2851460652 .color-N3 { color: #999999; }
-      .d2-2851460652 .connection.fill-B1 { fill: #E8E8E8; }
-    }
-    ]]></style><style type="text/css"><![CDATA[
-.dots-overlay {
-	fill: url(#dots-d2-2851460652);
-	mix-blend-mode: multiply;
-}
-    @media (prefers-color-scheme: dark) {
-      .d2-2851460652 .fill-N1 { fill: #E8E8E8; }
-      .d2-2851460652 .fill-N2 { fill: #CCCCCC; }
-      .d2-2851460652 .fill-N3 { fill: #999999; }
-      .d2-2851460652 .fill-B1 { fill: #E8E8E8; }
-      .d2-2851460652 .fill-B2 { fill: #7BACFF; }
-      .d2-2851460652 .fill-B3 { fill: #6AAFDC; }
-      .d2-2851460652 .fill-N4 { fill: #3A3A44; }
-      .d2-2851460652 .fill-N5 { fill: #2E2E38; }
-      .d2-2851460652 .fill-N6 { fill: #252530; }
-      .d2-2851460652 .fill-N7 { fill: transparent; }
-      .d2-2851460652 .fill-B4 { fill: #2A2A34; }
-      .d2-2851460652 .fill-B5 { fill: #1E1E28; }
-      .d2-2851460652 .fill-B6 { fill: #16161E; }
-      .d2-2851460652 .stroke-N1 { stroke: #E8E8E8; }
-      .d2-2851460652 .stroke-N2 { stroke: #CCCCCC; }
-      .d2-2851460652 .stroke-N3 { stroke: #999999; }
-      .d2-2851460652 .stroke-N4 { stroke: #555555; }
-      .d2-2851460652 .stroke-N5 { stroke: #444444; }
-      .d2-2851460652 .stroke-N6 { stroke: #333333; }
-      .d2-2851460652 .stroke-N7 { stroke: transparent; }
-      .d2-2851460652 .stroke-B1 { stroke: #E8E8E8; }
-      .d2-2851460652 .stroke-B2 { stroke: #7BACFF; }
-      .d2-2851460652 .stroke-B3 { stroke: #6AAFDC; }
-      .d2-2851460652 .stroke-B4 { stroke: #3A3A44; }
-      .d2-2851460652 .stroke-B5 { stroke: #2E2E38; }
-      .d2-2851460652 .stroke-B6 { stroke: #252530; }
-      .d2-2851460652 .color-N1 { color: #E8E8E8; }
-      .d2-2851460652 .color-N2 { color: #CCCCCC; }
-      .d2-2851460652 .color-N3 { color: #999999; }
-      .d2-2851460652 .connection.fill-B1 { fill: #E8E8E8; }
-    }
-    ]]></style><defs><pattern id="dots-d2-2851460652" x="0" y="0" width="15" height="15" patternUnits="userSpaceOnUse">
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="2" y="2" width="1" height="1" fill="#0A0F25"/>
-</g>
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="12" y="2" width="1" height="1" fill="#0A0F25"/>
-</g>
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="12" y="12" width="1" height="1" fill="#0A0F25"/>
-</g>
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="2" y="12" width="1" height="1" fill="#0A0F25"/>
-</g>
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="2" y="7" width="1" height="1" fill="#0A0F25"/>
-</g>
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="12" y="7" width="1" height="1" fill="#0A0F25"/>
-</g>
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="7" y="2" width="1" height="1" fill="#0A0F25"/>
-</g>
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="7" y="12" width="1" height="1" fill="#0A0F25"/>
-</g>
-<g style="mix-blend-mode:multiply" opacity="0.1">
-<rect x="7" y="7" width="1" height="1" fill="#0A0F25"/>
-</g>
-</pattern>
-</defs><g class="YWlj"><g class="shape" ><rect x="12.000000" y="12.000000" width="380.000000" height="485.000000" stroke="#000410" fill="#E7E9EE" class=" stroke-B1 fill-B4" style="stroke-width:2;" /><rect x="12.000000" y="12.000000" width="380.000000" height="485.000000" class="dots-overlay" style="stroke-width:2;" /><rect x="17.000000" y="17.000000" width="370.000000" height="475.000000" stroke="#000410" fill="transparent" class=" stroke-B1" style="stroke-width:2;" /></g><text x="202.000000" y="45.000000" fill="#000410" class="text-mono fill-N1" style="text-anchor:middle;font-size:28px">AIC OUTPUT</text></g><g class="YWlwZXJm"><g class="shape" ><rect x="472.000000" y="12.000000" width="420.000000" height="485.000000" stroke="#000410" fill="#E7E9EE" class=" stroke-B1 fill-B4" style="stroke-width:2;" /><rect x="472.000000" y="12.000000" width="420.000000" height="485.000000" class="dots-overlay" style="stroke-width:2;" /><rect x="477.000000" y="17.000000" width="410.000000" height="475.000000" stroke="#000410" fill="transparent" class=" stroke-B1" style="stroke-width:2;" /></g><text x="682.000000" y="45.000000" fill="#000410" class="text-mono fill-N1" style="text-anchor:middle;font-size:28px">AIPERF ARGUMENT</text></g><g class="YWljLmMx"><g class="shape" ><rect x="62.000000" y="62.000000" width="280.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="202.000000" y="98.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">CONCURRENCY: 56 (=14X4)</text></g><g class="YWljLmMy"><g class="shape" ><rect x="62.000000" y="143.000000" width="280.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="202.000000" y="179.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">ISL: 4000, OSL: 500</text></g><g class="YWljLmMz"><g class="shape" ><rect 
x="62.000000" y="224.000000" width="280.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="202.000000" y="260.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">MODEL: QWEN3-32B-FP8</text></g><g class="YWljLmM0"><g class="shape" ><rect x="62.000000" y="305.000000" width="280.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="202.000000" y="341.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">CONCURRENCY X ~14</text></g><g class="YWljLmM1"><g class="shape" ><rect x="62.000000" y="386.000000" width="280.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="202.000000" y="422.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">(BEST PRACTICE)</text></g><g class="YWlwZXJmLmEx"><g class="shape" ><rect x="522.000000" y="62.000000" width="320.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="682.000000" y="98.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">--CONCURRENCY 56</text></g><g class="YWlwZXJmLmEy"><g class="shape" ><rect x="522.000000" y="143.000000" width="320.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="682.000000" y="179.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">--ISL 4000 --OSL 500</text></g><g class="YWlwZXJmLmEz"><g class="shape" ><rect x="522.000000" y="224.000000" width="320.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="682.000000" y="260.000000" fill="#000410" 
class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">-M QWEN/QWEN3-32B-FP8</text></g><g class="YWlwZXJmLmE0"><g class="shape" ><rect x="522.000000" y="305.000000" width="320.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="682.000000" y="341.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">--NUM-REQUESTS 800</text></g><g class="YWlwZXJmLmE1"><g class="shape" ><rect x="522.000000" y="386.000000" width="320.000000" height="61.000000" stroke="#000410" fill="#F5F6F9" class=" stroke-B1 fill-B5" style="stroke-width:2;" /></g><text x="682.000000" y="422.000000" fill="#000410" class="text-mono-bold fill-N1" style="text-anchor:middle;font-size:16px">--EXTRA-INPUTS &#34;IGNORE_EOS:TRUE&#34;</text></g><g class="KGFpYy5jMSAtJmd0OyBhaXBlcmYuYTEpWzBd"><marker id="mk-d2-2851460652-3488378134" markerWidth="10.000000" markerHeight="12.000000" refX="7.000000" refY="6.000000" viewBox="0.000000 0.000000 10.000000 12.000000" orient="auto" markerUnits="userSpaceOnUse"> <polygon points="0.000000,0.000000 10.000000,6.000000 0.000000,12.000000" fill="#000410" class="connection fill-B1" stroke-width="2" /> </marker><path d="M 344.000000 92.500000 L 518.000000 92.500000" stroke="#000410" fill="none" class="connection stroke-B1" style="stroke-width:2;" marker-end="url(#mk-d2-2851460652-3488378134)" mask="url(#d2-2851460652)" /></g><g class="KGFpYy5jMiAtJmd0OyBhaXBlcmYuYTIpWzBd"><path d="M 344.000000 173.500000 L 518.000000 173.500000" stroke="#000410" fill="none" class="connection stroke-B1" style="stroke-width:2;" marker-end="url(#mk-d2-2851460652-3488378134)" mask="url(#d2-2851460652)" /></g><g class="KGFpYy5jMyAtJmd0OyBhaXBlcmYuYTMpWzBd"><path d="M 344.000000 254.500000 L 518.000000 254.500000" stroke="#000410" fill="none" class="connection stroke-B1" style="stroke-width:2;" marker-end="url(#mk-d2-2851460652-3488378134)" mask="url(#d2-2851460652)" 
/></g><g class="KGFpYy5jNCAtJmd0OyBhaXBlcmYuYTQpWzBd"><path d="M 344.000000 335.500000 L 518.000000 335.500000" stroke="#000410" fill="none" class="connection stroke-B1" style="stroke-width:2;" marker-end="url(#mk-d2-2851460652-3488378134)" mask="url(#d2-2851460652)" /></g><g class="KGFpYy5jNSAtJmd0OyBhaXBlcmYuYTUpWzBd"><path d="M 344.000000 416.500000 L 518.000000 416.500000" stroke="#000410" fill="none" class="connection stroke-B1" style="stroke-width:2;" marker-end="url(#mk-d2-2851460652-3488378134)" mask="url(#d2-2851460652)" /></g><mask id="d2-2851460652" maskUnits="userSpaceOnUse" x="-29" y="-29" width="962" height="567">
-<rect x="-29" y="-29" width="962" height="567" fill="white"></rect>
-
-</mask></svg></svg>
--- a/docs/images/pd_interpolation.png
+++ b/docs/images/pd_interpolation.png
--- a/docs/images/planner_perf.png
+++ b/docs/images/planner_perf.png
--- a/docs/images/planner_tensorboard.png
+++ b/docs/images/planner_tensorboard.png
--- a/docs/images/prefill_time.png
+++ b/docs/images/prefill_time.png
--- a/docs/index.rst
+++ b/docs/index.rst
-..
-    SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-    SPDX-License-Identifier: Apache-2.0
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-
-..
-   Main Page
-..
-
-Welcome to NVIDIA Dynamo
-========================
-
-The NVIDIA Dynamo Platform is a high-performance, low-latency inference framework designed to serve all AI models—across any framework, architecture, or deployment scale.
-
-.. admonition:: 💎 Discover the latest developments!
-   :class: seealso
-
-   This guide is a snapshot of a specific point in time. For the latest information, examples, and Release Assets, see the `Dynamo GitHub repository <https://github.com/ai-dynamo/dynamo/releases/latest>`_.
-
-Quickstart
-==========
-.. include:: _includes/quick_start_local.rst
-
-..
-   Sidebar
-..
-
-.. toctree::
-   :hidden:
-   :caption: Getting Started
-
-   Quickstart <self>
-   Support Matrix <reference/support-matrix.md>
-   Feature Matrix <reference/feature-matrix.md>
-   Release Artifacts <reference/release-artifacts.md>
-   Examples <_sections/examples>
-
-.. toctree::
-   :hidden:
-   :caption: Kubernetes Deployment
-
-   Deployment Guide <kubernetes/README>
-   Observability (K8s) <kubernetes/observability/metrics>
-   Multinode <kubernetes/deployment/multinode-deployment>
-
-.. toctree::
-   :hidden:
-   :caption: User Guides
-
-   KV Cache Aware Routing <components/router/router_guide.md>
-   Disaggregated Serving Guide <features/disaggregated_serving/README.md>
-   KV Cache Offloading <components/kvbm/kvbm_guide.md>
-   Benchmarking <benchmarks/benchmarking.md>
-   Multimodality Support <features/multimodal/README.md>
-   Tool Calling <agents/tool-calling.md>
-   LoRA Adapters <features/lora/README.md>
-   Observability (Local) <observability/README>
-   Fault Tolerance <fault_tolerance/README>
-   Writing Python Workers in Dynamo <development/backend-guide.md>
-
-.. toctree::
-   :hidden:
-   :caption: Components
-
-   Backends <_sections/backends>
-   Frontend <components/frontend/README>
-   Router <components/router/README>
-   Planner <components/planner/README>
-   Profiler <components/profiler/README>
-   KVBM <components/kvbm/README>
-
-.. toctree::
-   :hidden:
-   :caption: Integrations
-
-   LMCache <integrations/lmcache_integration.md>
-   SGLang HiCache <integrations/sglang_hicache.md>
-   FlexKV <integrations/flexkv_integration.md>
-   KV Events for Custom Engines <integrations/kv_events_custom_engines.md>
-
-.. toctree::
-   :hidden:
-   :caption: Design Docs
-
-   Overall Architecture <design_docs/architecture.md>
-   Architecture Flow <design_docs/dynamo_flow.md>
-   Disaggregated Serving <design_docs/disagg_serving.md>
-   Distributed Runtime <design_docs/distributed_runtime.md>
-   Request Plane <design_docs/request_plane.md>
-   Event Plane <design_docs/event_plane.md>
-   Router Design <design_docs/router_design.md>
-   KVBM Design <design_docs/kvbm_design.md>
-   Planner Design <design_docs/planner_design.md>
--- a/docs/integrations/flexkv_integration.md
+++ b/docs/integrations/flexkv_integration.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# FlexKV Integration in Dynamo
-
-## Introduction
-
-[FlexKV](https://github.com/taco-project/FlexKV) is a scalable, distributed runtime for KV cache offloading developed by Tencent Cloud's TACO team in collaboration with the community. It acts as a unified KV caching layer for inference engines like vLLM, TensorRT-LLM, and SGLang.
-
-### Key Features
-
-- **Multi-level caching**: CPU memory, local SSD, and scalable storage (cloud storage) for KV cache offloading
-- **Distributed KV cache reuse**: Share KV cache across multiple nodes using distributed RadixTree
-- **High-performance I/O**: Supports io_uring and GPU Direct Storage (GDS) for accelerated data transfer
-- **Asynchronous operations**: Get and put operations can overlap with computation through prefetching
-
-
-## Prerequisites
-
-1. **Dynamo installed** with vLLM support
-2. **Infrastructure services running**:
-   ```bash
-   docker compose -f deploy/docker-compose.yml up -d
-   ```
-3. **FlexKV dependencies** (for SSD offloading):
-   ```bash
-   apt install liburing-dev libxxhash-dev
-   ```
-
-## Quick Start
-
-### Enable FlexKV
-
-Set the `DYNAMO_USE_FLEXKV` environment variable and use the `--connector flexkv` flag:
-
-```bash
-export DYNAMO_USE_FLEXKV=1
-python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector flexkv
-```
-
-## Aggregated Serving
-
-### Basic Setup
-
-```bash
-# Terminal 1: Start frontend
-python -m dynamo.frontend &
-
-# Terminal 2: Start vLLM worker with FlexKV
-DYNAMO_USE_FLEXKV=1 \
-FLEXKV_CPU_CACHE_GB=32 \
-  python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector flexkv
-```
-
-### With KV-Aware Routing
-
-For multi-worker deployments with KV-aware routing to maximize cache reuse:
-
-```bash
-# Terminal 1: Start frontend with KV router
-python -m dynamo.frontend \
-    --router-mode kv \
-    --router-reset-states &
-
-# Terminal 2: Worker 1
-DYNAMO_USE_FLEXKV=1 \
-FLEXKV_CPU_CACHE_GB=32 \
-FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_0" \
-CUDA_VISIBLE_DEVICES=0 \
-python -m dynamo.vllm \
-    --model Qwen/Qwen3-0.6B \
-    --connector flexkv \
-    --gpu-memory-utilization 0.2 \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
-
-# Terminal 3: Worker 2
-DYNAMO_USE_FLEXKV=1 \
-FLEXKV_CPU_CACHE_GB=32 \
-FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_1" \
-CUDA_VISIBLE_DEVICES=1 \
-python -m dynamo.vllm \
-    --model Qwen/Qwen3-0.6B \
-    --connector flexkv \
-    --gpu-memory-utilization 0.2 \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
-```
-
-## Disaggregated Serving
-
-FlexKV can be used with disaggregated prefill/decode serving. The prefill worker uses FlexKV for KV cache offloading, while NIXL handles KV transfer between prefill and decode workers.
-
-```bash
-# Terminal 1: Start frontend
-python -m dynamo.frontend &
-
-# Terminal 2: Decode worker (without FlexKV)
-CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl &
-
-# Terminal 3: Prefill worker (with FlexKV)
-DYN_VLLM_KV_EVENT_PORT=20081 \
-VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
-DYNAMO_USE_FLEXKV=1 \
-FLEXKV_CPU_CACHE_GB=32 \
-CUDA_VISIBLE_DEVICES=1 \
-  python -m dynamo.vllm \
-  --model Qwen/Qwen3-0.6B \
-  --is-prefill-worker \
-  --connector nixl flexkv
-```
-
-## Configuration
-
-### Environment Variables
-
-| Variable | Description | Default |
-|----------|-------------|---------|
-| `DYNAMO_USE_FLEXKV` | Enable FlexKV integration | `0` (disabled) |
-| `FLEXKV_CPU_CACHE_GB` | CPU memory cache size in GB | Required |
-| `FLEXKV_CONFIG_PATH` | Path to FlexKV YAML config file | Not set |
-| `FLEXKV_SERVER_RECV_PORT` | IPC port for FlexKV server | Auto |
-
-### CPU-Only Offloading
-
-For simple CPU memory offloading:
-
-```bash
-unset FLEXKV_CONFIG_PATH
-export FLEXKV_CPU_CACHE_GB=32
-```
-
-### CPU + SSD Tiered Offloading
-
-For multi-tier offloading with SSD storage, create a configuration file:
-
-```bash
-cat > ./flexkv_config.yml <<EOF
-cpu_cache_gb: 32
-ssd_cache_gb: 1024
-ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/
-enable_gds: false
-EOF
-
-export FLEXKV_CONFIG_PATH="./flexkv_config.yml"
-```
-
-### Configuration Options
-
-| Option | Description |
-|--------|-------------|
-| `cpu_cache_gb` | CPU memory cache size in GB |
-| `ssd_cache_gb` | SSD cache size in GB |
-| `ssd_cache_dir` | SSD cache directories (semicolon-separated for multiple SSDs) |
-| `enable_gds` | Enable GPU Direct Storage for SSD I/O |
-
-> **Note:** For full configuration options, see the [FlexKV Configuration Reference](https://github.com/taco-project/FlexKV/blob/main/docs/flexkv_config_reference/README_en.md).
-
-## Distributed KV Cache Reuse
-
-FlexKV supports distributed KV cache reuse to share cache across multiple nodes. This enables:
-
-- **Distributed RadixTree**: Each node maintains a local snapshot of the global index
-- **Lease Mechanism**: Ensures data validity during cross-node transfers
-- **RDMA-based Transfer**: Uses Mooncake Transfer Engine for high-performance KV cache transfer
-
-For setup instructions, see the [FlexKV Distributed Reuse Guide](https://github.com/taco-project/FlexKV/blob/main/docs/dist_reuse/README_en.md).
-
-## Architecture
-
-FlexKV consists of three core modules:
-
-### StorageEngine
-
-Initializes the three-level cache (GPU → CPU → SSD/Cloud). It groups multiple tokens into blocks and stores KV cache at the block level, maintaining the same KV shape as in GPU memory.
-
-### GlobalCacheEngine
-
-The control plane that determines data transfer direction and identifies source/destination block IDs. Includes:
- RadixTree for prefix matching
- Memory pool to track space usage and trigger eviction
-
-### TransferEngine
-
-The data plane that executes data transfers:
- Multi-threading for parallel transfers
- High-performance I/O (io_uring, GDS)
- Asynchronous operations overlapping with computation
-
-## Verify Deployment
-
-```bash
-curl localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "Qwen/Qwen3-0.6B",
-    "messages": [{"role": "user", "content": "Hello!"}],
-    "stream": false,
-    "max_tokens": 30
-  }'
-```
-
-## See Also
-
- [FlexKV GitHub Repository](https://github.com/taco-project/FlexKV)
- [FlexKV vLLM Adapter Documentation](https://github.com/taco-project/FlexKV/blob/main/docs/vllm_adapter/README_en.md)
-
--- a/docs/integrations/kv_events_custom_engines.md
+++ b/docs/integrations/kv_events_custom_engines.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-->
-
-# KV Event Publishing for Custom Engines
-
-This document explains how to implement KV event publishing for custom inference engines, enabling them to participate in Dynamo's KV cache-aware routing.
-
-## Overview
-
-The KV Router relies on real-time events from backend workers to track which KV cache blocks are stored on each worker. When your custom engine allocates or evicts KV cache blocks, it should publish these events so the router can make optimal routing decisions.
-
-There are two main publishing pathways:
-
-1. **Direct NATS publishing** (`KvEventPublisher`) - Publishes events directly to NATS. Simplest approach for custom engines.
-2. **ZMQ-based publishing** - For engines with ZMQ event output (like vLLM). Uses a ZMQ publisher in the engine and `ZmqKvEventPublisher` to forward events to NATS.
-
-## Event Types
-
-The KV cache supports three event types:
-
-| Event Type | Description | When to Publish |
-|------------|-------------|-----------------|
-| `BlockStored` | New blocks added to cache | After KV cache allocation succeeds |
-| `BlockRemoved` | Blocks evicted from cache | When blocks are evicted or freed |
-| `AllBlocksCleared` | All blocks removed | On cache reset or worker restart |
-
-### Event Structure
-
-Each event contains:
- **`event_id`**: Monotonically increasing identifier per worker
- **`dp_rank`**: Data parallel rank (0 if DP not enabled)
- **`data`**: One of `Stored`, `Removed`, or `Cleared`
-
-For `BlockStored` events:
- **`token_ids`**: List of token IDs for the stored blocks
- **`block_hashes`**: List of **sequence block hashes** from the engine's block manager. These are cumulative hashes that incorporate all tokens from the start of the sequence up to and including the current block (not just the tokens within that block). This enables prefix matching across requests.
- **`num_block_tokens`**: List with the number of tokens in each stored block, one entry per block (every entry should equal `kv_block_size`)
- **`parent_hash`**: Hash of the parent block. Required for all blocks except the first block in a sequence (which has no parent).
- **`lora_id`**: LoRA adapter ID (0 if not using LoRA)
-
-For `BlockRemoved` events:
- **`block_hashes`**: List of sequence block hashes being evicted
-
-## Option 1: Direct NATS Publishing (Recommended)
-
-The `KvEventPublisher` class publishes events directly to NATS. This is the simplest approach for custom engines.
-
-```mermaid
-flowchart LR
-    subgraph Engine["Custom Engine"]
-        cache["KV Cache Manager"]
-    end
-
-    subgraph Worker["Dynamo Worker Process"]
-        pub["KvEventPublisher"]
-    end
-
-    subgraph NATS["NATS"]
-        subject["kv-events subject"]
-    end
-
-    subgraph Router["KV Router"]
-        indexer["KvIndexer"]
-    end
-
-    cache -->|"on_blocks_stored()<br/>on_blocks_removed()"| pub
-    pub -->|"publish to NATS"| subject
-    subject --> indexer
-```
-
-**When to use:**
- Building a custom inference engine from scratch
- Your engine doesn't have a ZMQ-based event system
- You want the simplest integration path
-
-### Basic Setup
-
-```python
-from dynamo.llm import KvEventPublisher
-
-class CustomEnginePublisher:
-    def __init__(self, component, worker_id: int, block_size: int, dp_rank: int = 0):
-        self.block_size = block_size
-        self.event_id = 0
-        self.kv_publisher = KvEventPublisher(
-            component=component,
-            worker_id=worker_id,
-            kv_block_size=block_size,
-            dp_rank=dp_rank,
-        )
-
-    def on_blocks_stored(self, token_ids: list[int], block_hashes: list[int],
-                         lora_id: int = 0, parent_hash: int | None = None):
-        """Call after KV cache blocks are allocated."""
-        self.event_id += 1
-        num_block_tokens = [self.block_size] * len(block_hashes)
-        self.kv_publisher.publish_stored(
-            event_id=self.event_id,
-            token_ids=token_ids,
-            num_block_tokens=num_block_tokens,
-            block_hashes=block_hashes,
-            lora_id=lora_id,
-            parent_hash=parent_hash,
-        )
-
-    def on_blocks_removed(self, block_hashes: list[int]):
-        """Call when KV cache blocks are evicted."""
-        self.event_id += 1
-        self.kv_publisher.publish_removed(event_id=self.event_id, block_hashes=block_hashes)
-```
-
-### Integration with Your Engine
-
-```python
-from dynamo.llm import register_llm
-
-async def main():
-    # Register your engine with Dynamo
-    component, endpoint = await register_llm(
-        model="my-model",
-        generator=my_generate_fn,
-    )
-
-    # Initialize publisher
-    publisher = CustomEnginePublisher(
-        component=component,
-        worker_id=endpoint.connection_id(),
-        block_size=16,  # Match your engine's block size
-    )
-
-    # Hook into your engine's cache events
-    def on_prefill_complete(request_id, token_ids, blocks):
-        block_hashes = [block.hash for block in blocks]
-        publisher.on_blocks_stored(token_ids=token_ids, block_hashes=block_hashes)
-
-    def on_cache_eviction(evicted_blocks):
-        block_hashes = [block.hash for block in evicted_blocks]
-        publisher.on_blocks_removed(block_hashes=block_hashes)
-```
-
-## Option 2: ZMQ-based Publishing
-
-For engines that publish events via ZMQ (like vLLM), this option uses two components that work together:
-
-1. **ZMQ Publisher** (in your engine) - Publishes events to a ZMQ socket
-2. **ZmqKvEventPublisher** (Dynamo binding) - Subscribes to ZMQ and forwards to NATS
-
-```mermaid
-flowchart LR
-    subgraph Engine["Custom Engine / vLLM"]
-        cache["KV Cache Manager"]
-        zmq_pub["ZMQ Publisher<br/>(Pure Python)"]
-    end
-
-    subgraph ZMQ["ZMQ Socket"]
-        socket["tcp://127.0.0.1:5557"]
-    end
-
-    subgraph Worker["Dynamo Worker Process"]
-        zmq_sub["ZmqKvEventPublisher<br/>(Rust bindings)"]
-    end
-
-    subgraph NATS["NATS"]
-        subject["kv-events subject"]
-    end
-
-    subgraph Router["KV Router"]
-        indexer["KvIndexer"]
-    end
-
-    cache --> zmq_pub
-    zmq_pub -->|"PUB"| socket
-    socket -->|"SUB"| zmq_sub
-    zmq_sub --> subject
-    subject --> indexer
-```
-
-**When to use:**
- Your engine already has a ZMQ-based event system (like vLLM)
- You're integrating with a consolidator (like KVBM)
- You want to decouple event publishing from your engine's main loop
-
-### Part 1: ZMQ Subscriber (Dynamo Bindings)
-
-If your engine already publishes to ZMQ, use `KvEventPublisher` with a `ZmqKvEventPublisherConfig` to subscribe and forward to NATS:
-
-```python
-from dynamo.llm import KvEventPublisher, ZmqKvEventPublisherConfig
-
-# Configure the ZMQ subscriber
-config = ZmqKvEventPublisherConfig(
-    worker_id=endpoint.connection_id(),
-    kv_block_size=block_size,
-    zmq_endpoint="tcp://127.0.0.1:5557",  # Where your engine publishes
-    zmq_topic="",                          # Subscribe to all topics
-)
-
-# Create publisher - it automatically subscribes to ZMQ and forwards to NATS
-kv_publisher = KvEventPublisher(
-    component=component,
-    zmq_config=config,
-)
-```
-
-### Part 2: ZMQ Publisher (Pure Python)
-
-If your engine needs to publish to ZMQ (e.g., for consolidator integration), implement the ZMQ protocol:
-
-```python
-import zmq
-import msgpack
-import time
-
-class ZmqKvEventPublisher:
-    """Pure Python ZMQ publisher for KV events (vLLM-compatible format)."""
-
-    def __init__(self, zmq_endpoint: str, kv_block_size: int, topic: str = ""):
-        self.kv_block_size = kv_block_size
-        self.topic = topic
-        self.ctx = zmq.Context()
-        self.socket = self.ctx.socket(zmq.PUB)
-        self.socket.bind(zmq_endpoint)
-        self.sequence = 0
-        self.data_parallel_rank = 0
-
-    def _to_signed_i64(self, value: int | None) -> int | None:
-        if value is None:
-            return None
-        return value - 0x10000000000000000 if value > 0x7FFFFFFFFFFFFFFF else value
-
-    def publish_stored(self, event_id: int, token_ids: list[int], num_block_tokens: list[int],
-                       block_hashes: list[int], lora_id: int = 0, parent_hash: int | None = None):
-        event = {
-            "type": "BlockStored",
-            "block_hashes": [self._to_signed_i64(h) for h in block_hashes],
-            "parent_block_hash": self._to_signed_i64(parent_hash),
-            "token_ids": token_ids,
-            "block_size": self.kv_block_size,
-            "lora_id": lora_id if lora_id != 0 else None,
-        }
-        self._publish_event(event)
-
-    def publish_removed(self, event_id: int, block_hashes: list[int]):
-        event = {"type": "BlockRemoved", "block_hashes": [self._to_signed_i64(h) for h in block_hashes]}
-        self._publish_event(event)
-
-    def publish_all_cleared(self):
-        self._publish_event({"type": "AllBlocksCleared"})
-
-    def _publish_event(self, event: dict):
-        batch = [time.time(), [event], self.data_parallel_rank]
-        payload = msgpack.packb(batch, use_bin_type=True)
-        sequence_bytes = self.sequence.to_bytes(8, byteorder="big")
-        self.sequence += 1
-        self.socket.send_multipart([self.topic.encode(), sequence_bytes, payload])
-
-    def shutdown(self):
-        self.socket.close()
-        self.ctx.term()
-```
-
-### ZMQ Wire Format
-
-The ZMQ message format (compatible with vLLM):
-
-| Frame | Description |
-|-------|-------------|
-| 1 | Topic (empty string for all topics) |
-| 2 | Sequence number (8 bytes, big-endian) |
-| 3 | Msgpack payload: `[timestamp, [events], dp_rank]` |
-
-Each event in the payload is a dictionary with a `type` field (`BlockStored`, `BlockRemoved`, or `AllBlocksCleared`).
-
-## Best Practices
-
-1. **Event IDs must be monotonically increasing** per worker (use a thread-safe counter)
-
-2. **Block size must match** your engine's actual `kv_block_size`
-
-3. **`parent_hash` is required** for all blocks except the first in a sequence - it links blocks to enable prefix matching
-
-## See Also
-
- **[Router README](../components/router/README.md)**: Quick start guide for the KV Router
- **[Router Guide](../components/router/router_guide.md)**: Configuration, tuning, and production setup
- **[Router Design](../design_docs/router_design.md)**: Architecture details and event transport modes
--- a/docs/integrations/lmcache_integration.md
+++ b/docs/integrations/lmcache_integration.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# LMCache Integration in Dynamo
-
-## Introduction
-
-LMCache is a high-performance KV cache layer that supercharges LLM serving by enabling **prefill-once, reuse-everywhere** semantics. As described in the [official documentation](https://docs.lmcache.ai/index.html), LMCache lets LLMs prefill each text only once by storing the KV caches of all reusable texts, allowing reuse of KV caches for any reused text (not necessarily prefix) across any serving engine instance.
-
-This document describes how LMCache is integrated into Dynamo's vLLM backend to provide enhanced performance and memory efficiency.
-
-## Platform Support
-
-**Important Note**: LMCache integration currently only supports x86 architecture. ARM64 is not supported at this time.
-
-## Aggregated Serving
-
-### Configuration
-
-LMCache is enabled using the `--connector lmcache` flag:
-
-```bash
-python -m dynamo.vllm --model <model_name> --connector lmcache
-```
-
-### Customization
-
-LMCache configuration can be customized via environment variables listed [here](https://docs.lmcache.ai/api_reference/configurations.html).
-
-For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html):
-
- **CPU RAM**: Fast local memory offloading
- **Local Storage**: Disk-based persistence
- **Redis**: Distributed cache sharing
- **GDS Backend**: GPU Direct Storage for high throughput
- **InfiniStore/Mooncake**: Cloud-native storage solutions
-
-### Deployment
-
-Use the provided launch script for quick setup:
-
-```bash
-./examples/backends/vllm/launch/agg_lmcache.sh
-```
-
-This will:
-1. Start the Dynamo frontend
-2. Launch a single vLLM worker with LMCache enabled
-
-### Architecture for Aggregated Mode
-
-In aggregated mode, the system uses:
-
- **KV Connector**: `LMCacheConnectorV1`
- **KV Role**: `kv_both` (handles both reading and writing)
-
-## Disaggregated Serving
-
-Disaggregated serving separates prefill and decode operations into dedicated workers. This provides better resource utilization and scalability for production deployments.
-
-### Deployment
-
-Use the provided disaggregated launch script (requires at least 2 GPUs):
-
-```bash
-./examples/backends/vllm/launch/disagg_lmcache.sh
-```
-
-This will:
-1. Start the Dynamo frontend
-2. Launch a decode worker on GPU 0
-3. Wait for initialization
-4. Launch a prefill worker on GPU 1 with LMCache enabled
-
-### Worker Roles
-
-#### Decode Worker
-
- **Purpose**: Handles token generation (decode phase)
- **GPU Assignment**: CUDA_VISIBLE_DEVICES=0
- **LMCache Config**: Uses only `NixlConnector`, which handles KV transfer between prefill and decode workers
-
-#### Prefill Worker
-
- **Purpose**: Handles prompt processing (prefill phase)
- **GPU Assignment**: CUDA_VISIBLE_DEVICES=1
- **LMCache Config**: Uses `MultiConnector` with both the LMCache and NIXL connectors. This enables the prefill worker to use LMCache for KV offloading and NIXL for KV transfer between prefill and decode workers.
- **Flag**: `--is-prefill-worker`
-
-## Architecture
-
-### KV Transfer Configuration
-
-The system automatically configures KV transfer based on the deployment mode and worker type:
-
-#### Prefill Worker (Disaggregated Mode)
-
-```python
-kv_transfer_config = KVTransferConfig(
-    kv_connector="PdConnector",
-    kv_role="kv_both",
-    kv_connector_extra_config={
-        "connectors": [
-            {"kv_connector": "LMCacheConnectorV1", "kv_role": "kv_both"},
-            {"kv_connector": "NixlConnector", "kv_role": "kv_both"}
-        ]
-    }
-)
-```
-
-#### Decode Worker or Aggregated Mode
-
-```python
-kv_transfer_config = KVTransferConfig(
-    kv_connector="LMCacheConnectorV1",
-    kv_role="kv_both"
-)
-```
-
-#### Fallback (No LMCache)
-
-```python
-kv_transfer_config = KVTransferConfig(
-    kv_connector="NixlConnector",
-    kv_role="kv_both"
-)
-```
-
-### Integration Points
-
-1. **Argument Parsing** (`args.py`):
-   - Configures appropriate KV transfer settings
-   - Sets up connector configurations based on worker type
-
-2. **Engine Setup** (`main.py`):
-   - Initializes LMCache environment variables
-   - Creates vLLM engine with proper KV transfer config
-   - Handles both aggregated and disaggregated modes
-
-### Best Practices
-
-1. **Chunk Size Tuning**: Adjust `LMCACHE_CHUNK_SIZE` based on your use case:
-   - Smaller chunks (128-256): Better reuse granularity for varied content
-   - Larger chunks (512-1024): More efficient for repetitive content patterns
-
-2. **Memory Allocation**: Set `LMCACHE_MAX_LOCAL_CPU_SIZE` conservatively:
-   - Leave sufficient RAM for other system processes
-   - Monitor memory usage during peak loads
-
-3. **Workload Optimization**: LMCache performs best with:
-   - Repeated prompt patterns (RAG, multi-turn conversations)
-   - Shared context across sessions
-   - Long-running services with warm caches
-
-## Metrics and Monitoring
-
-When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
-
-**Requirements to access LMCache metrics:**
-
- `--connector lmcache` - Enables LMCache
- `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint
- `PROMETHEUS_MULTIPROC_DIR` (optional) - If not set, Dynamo manages it internally
-
-For detailed information on LMCache metrics, including the complete list of available metrics and how to access them, see the **[LMCache Metrics section](../backends/vllm/prometheus.md#lmcache-metrics)** in the vLLM Prometheus Metrics Guide.
-
-## Troubleshooting
-
-### LMCache log: `PrometheusLogger instance already created with different metadata`
-
-You may see an error like:
-
-```text
-LMCache ERROR: PrometheusLogger instance already created with different metadata. This should not happen except in test
-```
-
-**Version note**: We reproduced this behavior with **vLLM v0.12.0**. We have not reproduced it with **vLLM v0.11.0**, so it may be specific to (or introduced in) v0.12.0.
-
-This message is emitted by LMCache when the LMCache connector is initialized more than once in the same process (for example, once for a `WORKER` role and later for a `SCHEDULER` role). LMCache uses a process-global singleton for its Prometheus logger, so the second initialization logs this error if its metadata differs from the first.
-
- **Impact**: This is a log-only error; in our testing it does not prevent vLLM/Dynamo from serving requests. If you care about LMCache metric labels, be aware the logger singleton uses the first-seen metadata.
- **Repro without Dynamo** (vLLM v0.12.0):
-
-```bash
-vllm serve Qwen/Qwen3-0.6B \
-  --host 127.0.0.1 --port 18000 \
-  --gpu-memory-utilization 0.24 \
-  --enforce-eager \
-  --no-enable-prefix-caching \
-  --max-num-seqs 2 \
-  --kv-offloading-backend lmcache \
-  --kv-offloading-size 1 \
-  --disable-hybrid-kv-cache-manager
-```
-
- **Mitigation (silence)**: set `LMCACHE_LOG_LEVEL=CRITICAL`.
- **Upstream issue**: [vLLM issue #30996](https://github.com/vllm-project/vllm/issues/30996).
-
-### vLLM log: `Found PROMETHEUS_MULTIPROC_DIR was set by user`
-
-vLLM v1 uses `prometheus_client.multiprocess` and stores intermediate metric values in `PROMETHEUS_MULTIPROC_DIR`.
-
- If you **set `PROMETHEUS_MULTIPROC_DIR` yourself**, vLLM warns that the directory must be wiped between runs to avoid stale/incorrect metrics.
- When running via Dynamo, the vLLM wrapper may set `PROMETHEUS_MULTIPROC_DIR` internally to a temporary directory to avoid vLLM cleanup issues. If you still see the warning, confirm you are not exporting `PROMETHEUS_MULTIPROC_DIR` in your shell or container environment.
-
-## References and Additional Resources
-
- [LMCache Documentation](https://docs.lmcache.ai/index.html) - Comprehensive guide and API reference
- [Configuration Reference](https://docs.lmcache.ai/api_reference/configurations.html) - Detailed configuration options
- [LMCache Observability Guide](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Metrics and monitoring details
-
--- a/docs/integrations/sglang_hicache.md
+++ b/docs/integrations/sglang_hicache.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-->
-
-# Enable SGLang Hierarchical Cache (HiCache)
-
-This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dynamo.
-
-## 1) Start the SGLang worker with HiCache enabled
-
-```bash
-python -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
-  --host 0.0.0.0 --port 8000 \
-  --page-size 64 \
-  --enable-hierarchical-cache \
-  --hicache-ratio 2 \
-  --hicache-write-policy write_through \
-  --hicache-storage-backend nixl \
-  --log-level debug \
-  --skip-tokenizer-init
-```
-
- **--enable-hierarchical-cache**: Enables hierarchical KV cache/offload
- **--hicache-ratio**: The ratio of the size of the host KV cache memory pool to the size of the device pool. Lower this number if your machine has less CPU memory.
- **--hicache-write-policy**: Write policy (e.g., `write_through` for synchronous host writes)
- **--hicache-storage-backend**: Host storage backend for HiCache (e.g., `nixl`). NIXL selects the concrete store automatically; see [PR #8488](https://github.com/sgl-project/sglang/pull/8488)
-
-
-Then, start the frontend:
-```bash
-python -m dynamo.frontend --http-port 8000
-```
-
-## 2) Send a single request
-
-```bash
-curl localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "Qwen/Qwen3-0.6B",
-    "messages": [
-      {
-        "role": "user",
-        "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
-      }
-    ],
-    "stream": false,
-    "max_tokens": 30
-  }'
-```
-
-## 3) (Optional) Benchmarking
-
-Run the perf script:
-```bash
-bash -x $DYNAMO_ROOT/benchmarks/llm/perf.sh \
-  --model Qwen/Qwen3-0.6B \
-  --tensor-parallelism 1 \
-  --data-parallelism 1 \
-  --concurrency "2,4,8" \
-  --input-sequence-length 2048 \
-  --output-sequence-length 256
-```
--- a/docs/kubernetes/README.md
+++ b/docs/kubernetes/README.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# Deploying Dynamo on Kubernetes
-
-High-level guide to Dynamo Kubernetes deployments. Start here, then dive into specific guides.
-
-## Important Terminology
-
-**Kubernetes Namespace**: The K8s namespace where your DynamoGraphDeployment resource is created.
- Used for: Resource isolation, RBAC, organizing deployments
- Example: `dynamo-system`, `team-a-namespace`
-
-**Dynamo Namespace**: The logical namespace used by Dynamo components for [service discovery](/docs/kubernetes/service_discovery.md).
- Used for: Runtime component communication, service discovery
- Specified in: `.spec.services.<ServiceName>.dynamoNamespace` field
- Example: `my-llm`, `production-model`, `dynamo-dev`
-
-These are independent. A single Kubernetes namespace can host multiple Dynamo namespaces, and vice versa.
-
-## Prerequisites
-
-Before you begin, ensure you have the following tools installed:
-
-| Tool | Minimum Version | Installation Guide |
-|------|-----------------|-------------------|
-| **kubectl** | v1.24+ | [Install kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) |
-| **Helm** | v3.0+ | [Install Helm](https://helm.sh/docs/intro/install/) |
-
-Verify your installation:
-```bash
-kubectl version --client  # Should show v1.24+
-helm version              # Should show v3.0+
-```
-
-For detailed installation instructions, see the [Prerequisites section](/docs/kubernetes/installation_guide.md#prerequisites) in the Installation Guide.
-
-## Pre-deployment Checks
-
-Before deploying the platform, run the pre-deployment checks to ensure the cluster is ready:
-
-```bash
-./deploy/pre-deployment/pre-deployment-check.sh
-```
-
-This validates kubectl connectivity, StorageClass configuration, and GPU availability. See [pre-deployment checks](/deploy/pre-deployment/README.md) for more details.
-
-## 1. Install Platform First
-
-```bash
-# 1. Set environment
-export NAMESPACE=dynamo-system
-export RELEASE_VERSION=0.x.x # any version of Dynamo 0.3.2+ listed at https://github.com/ai-dynamo/dynamo/releases
-
-# 2. Install CRDs (skip if on shared cluster where CRDs already exist)
-helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
-helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default
-
-# 3. Install Platform
-helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
-helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace
-```
-
-**For Shared/Multi-Tenant Clusters:**
-
-If your cluster has namespace-restricted Dynamo operators, add this flag to step 3:
-```bash
--set dynamo-operator.namespaceRestriction.enabled=true
-```
-
-For more details or customization options (including multinode deployments), see **[Installation Guide for Dynamo Kubernetes Platform](/docs/kubernetes/installation_guide.md)**.
-
-## 2. Choose Your Backend
-
-Each backend has deployment examples and configuration options:
-
-| Backend      | Aggregated | Aggregated + Router | Disaggregated | Disaggregated + Router | Disaggregated + Planner | Disaggregated Multi-node |
-|--------------|:----------:|:-------------------:|:-------------:|:----------------------:|:-----------------------:|:------------------------:|
-| **[SGLang](/examples/backends/sglang/deploy/README.md)**       | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| **[TensorRT-LLM](/examples/backends/trtllm/deploy/README.md)** | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ |
-| **[vLLM](/examples/backends/vllm/deploy/README.md)**           | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-
-## 3. Deploy Your First Model
-
-```bash
-export NAMESPACE=dynamo-system
-kubectl create namespace ${NAMESPACE}
-
-# to pull model from HF
-export HF_TOKEN=<Token-Here>
-kubectl create secret generic hf-token-secret \
-  --from-literal=HF_TOKEN="$HF_TOKEN" \
-  -n ${NAMESPACE};
-
-# Deploy any example (this uses vLLM with Qwen model using aggregated serving)
-kubectl apply -f examples/backends/vllm/deploy/agg.yaml -n ${NAMESPACE}
-
-# Check status
-kubectl get dynamoGraphDeployment -n ${NAMESPACE}
-
-# Test it
-kubectl port-forward svc/vllm-agg-frontend 8000:8000 -n ${NAMESPACE}
-curl http://localhost:8000/v1/models
-```
-
-For SLA-based autoscaling, see [SLA Planner Guide](/docs/components/planner/planner_guide.md).
-
-## Understanding Dynamo's Custom Resources
-
-Dynamo provides two main Kubernetes Custom Resources for deploying models:
-
-### DynamoGraphDeploymentRequest (DGDR) - Simplified SLA-Driven Configuration
-
-The **recommended approach** for generating optimal configurations. DGDR provides a high-level interface where you specify:
- Model name and backend framework
- SLA targets (latency requirements)
- GPU type (optional)
-
-Dynamo automatically handles profiling and generates an optimized DGD spec in the status. Perfect for:
- SLA-driven configuration generation
- Automated resource optimization
- Users who want simplicity over control
-
-**Note**: DGDR generates a DGD spec which you can then use to deploy.
-
-### DynamoGraphDeployment (DGD) - Direct Configuration
-
-A lower-level interface that defines your complete inference pipeline:
- Model configuration
- Resource allocation (GPUs, memory)
- Scaling policies
- Frontend/backend connections
-
-Use this when you need fine-grained control or have already completed profiling.
-
-Refer to the [API Reference and Documentation](/docs/kubernetes/api_reference.md) for more details.
-
-## 📖 API Reference & Documentation
-
-For detailed technical specifications of Dynamo's Kubernetes resources:
-
- **[API Reference](/docs/kubernetes/api_reference.md)** - Complete CRD field specifications for all Dynamo resources
- **[Create Deployment](/docs/kubernetes/deployment/create_deployment.md)** - Step-by-step deployment creation with DynamoGraphDeployment
- **[Operator Guide](/docs/kubernetes/dynamo_operator.md)** - Dynamo operator configuration and management
-
-### Choosing Your Architecture Pattern
-
-When creating a deployment, select the architecture pattern that best fits your use case:
-
- **Development / Testing** - Use `agg.yaml` as the base configuration
- **Production with Load Balancing** - Use `agg_router.yaml` to enable scalable, load-balanced inference
- **High Performance / Disaggregated** - Use `disagg_router.yaml` for maximum throughput and modular scalability
-
-### Frontend and Worker Components
-
-You can run the Frontend on one machine (e.g., a CPU node) and workers on different machines (GPU nodes). The Frontend serves as a framework-agnostic HTTP entry point that:
-
- Provides an OpenAI-compatible `/v1/chat/completions` endpoint
- Auto-discovers backend workers via [service discovery](/docs/kubernetes/service_discovery.md) (Kubernetes-native by default)
- Routes requests and handles load balancing
- Validates and preprocesses requests
-
-### Customizing Your Deployment
-
-Example structure:
-```yaml
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: my-llm
-spec:
-  services:
-    Frontend:
-      dynamoNamespace: my-llm
-      componentType: frontend
-      replicas: 1
-      extraPodSpec:
-        mainContainer:
-          image: your-image
-    VllmDecodeWorker:  # or SGLangDecodeWorker, TrtllmDecodeWorker
-      dynamoNamespace: dynamo-dev
-      componentType: worker
-      replicas: 1
-      envFromSecret: hf-token-secret  # for HuggingFace models
-      resources:
-        limits:
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: your-image
-          command: ["/bin/sh", "-c"]
-          args:
-            - python3 -m dynamo.vllm --model YOUR_MODEL [--your-flags]
-```
-
-Worker command examples per backend:
-```yaml
-# vLLM worker
-args:
-  - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
-
-# SGLang worker
-args:
-  - >-
-    python3 -m dynamo.sglang
-    --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-    --tp 1
-    --trust-remote-code
-
-# TensorRT-LLM worker
-args:
-  - python3 -m dynamo.trtllm
-    --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-    --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-    --extra-engine-args /workspace/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml
-```
-
-Key customization points include:
- **Model Configuration**: Specify model in the args command
- **Resource Allocation**: Configure GPU requirements under `resources.limits`
- **Scaling**: Set `replicas` for number of worker instances
- **Routing Mode**: Enable KV-cache routing by setting `DYN_ROUTER_MODE=kv` in Frontend envs
- **Worker Specialization**: Add `--is-prefill-worker` flag for disaggregated prefill workers
-
-## Additional Resources
-
- **[Examples](/docs/examples/README.md)** - Complete working examples
- **[Create Custom Deployments](/docs/kubernetes/deployment/create_deployment.md)** - Build your own DynamoGraphDeployment custom resources
- **[Managing Models with DynamoModel](/docs/kubernetes/deployment/dynamomodel-guide.md)** - Deploy LoRA adapters and manage models
- **[Operator Documentation](/docs/kubernetes/dynamo_operator.md)** - How the platform works
- **[Service Discovery](/docs/kubernetes/service_discovery.md)** - Discovery backends and configuration
- **[Helm Charts](/deploy/helm/README.md)** - For advanced users
- **[Checkpointing](/docs/kubernetes/chrek/README.md)** - Fast pod startup with checkpoint/restore
- **[GitOps Deployment with FluxCD](/docs/kubernetes/fluxcd.md)** - For advanced users
- **[Logging](/docs/kubernetes/observability/logging.md)** - For logging setup
- **[Multinode Deployment](/docs/kubernetes/deployment/multinode-deployment.md)** - For multinode deployment
- **[Grove](/docs/kubernetes/grove.md)** - For grove details and custom installation
- **[Monitoring](/docs/kubernetes/observability/metrics.md)** - For monitoring setup
- **[Model Caching with Fluid](/docs/kubernetes/model_caching_with_fluid.md)** - For model caching with Fluid
-
-```{toctree}
-:hidden:
-
-Detailed Installation Guide <installation_guide>
-Dynamo Operator <dynamo_operator>
-Service Discovery <service_discovery>
-Webhooks <webhooks>
-Minikube Setup <deployment/minikube>
-Managing Models with DynamoModel <deployment/dynamomodel-guide>
-Autoscaling <autoscaling>
-Checkpointing <chrek/README>
-```
--- a/docs/kubernetes/api_reference.md
+++ b/docs/kubernetes/api_reference.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-> **⚠️ Important**: This documentation is automatically generated from source code.
-> Do not edit this file directly.
-
-# API Reference
-
-## Packages
- [nvidia.com/v1alpha1](#nvidiacomv1alpha1)
-
-
-## nvidia.com/v1alpha1
-
-Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
-
-This package defines the DynamoGraphDeploymentRequest (DGDR) custom resource, which provides
-a high-level, SLA-driven interface for deploying machine learning models on Dynamo.
-
-Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
-
-### Resource Types
- [DynamoCheckpoint](#dynamocheckpoint)
- [DynamoComponentDeployment](#dynamocomponentdeployment)
- [DynamoGraphDeployment](#dynamographdeployment)
- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
- [DynamoModel](#dynamomodel)
-
-
-
-#### Autoscaling
-
-
-
-Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
-with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
-for migration guidance. This field will be removed in a future API version.
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `enabled` _boolean_ | Deprecated: This field is ignored. |  |  |
-| `minReplicas` _integer_ | Deprecated: This field is ignored. |  |  |
-| `maxReplicas` _integer_ | Deprecated: This field is ignored. |  |  |
-| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | Deprecated: This field is ignored. |  |  |
-| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | Deprecated: This field is ignored. |  |  |
-
-
-
-
-#### CheckpointMode
-
-_Underlying type:_ _string_
-
-CheckpointMode defines how checkpoint creation is handled
-
-_Validation:_
- Enum: [Auto Manual]
-
-_Appears in:_
- [ServiceCheckpointConfig](#servicecheckpointconfig)
-
-| Field | Description |
-| --- | --- |
-| `Auto` | CheckpointModeAuto means the DGD controller will automatically create a Checkpoint CR<br /> |
-| `Manual` | CheckpointModeManual means the user must create the Checkpoint CR themselves<br /> |
-
-
-#### ComponentKind
-
-_Underlying type:_ _string_
-
-ComponentKind represents the type of underlying Kubernetes resource.
-
-_Validation:_
- Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet]
-
-_Appears in:_
- [ServiceReplicaStatus](#servicereplicastatus)
-
-| Field | Description |
-| --- | --- |
-| `PodClique` | ComponentKindPodClique represents a PodClique resource.<br /> |
-| `PodCliqueScalingGroup` | ComponentKindPodCliqueScalingGroup represents a PodCliqueScalingGroup resource.<br /> |
-| `Deployment` | ComponentKindDeployment represents a Deployment resource.<br /> |
-| `LeaderWorkerSet` | ComponentKindLeaderWorkerSet represents a LeaderWorkerSet resource.<br /> |
-
-
-#### ConfigMapKeySelector
-
-
-
-ConfigMapKeySelector selects a specific key from a ConfigMap.
-Used to reference external configuration data stored in ConfigMaps.
-
-
-
-_Appears in:_
- [ProfilingConfigSpec](#profilingconfigspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name of the ConfigMap containing the desired data. |  | Required: \{\} <br /> |
-| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml |  |
-
-
-#### DeploymentOverridesSpec
-
-
-
-DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
-When autoApply is enabled, these overrides are applied to the generated DGD resource.
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. |  | Optional: \{\} <br /> |
-| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. |  | Optional: \{\} <br /> |
-| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. |  | Optional: \{\} <br /> |
-| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. |  | Optional: \{\} <br /> |
-| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.<br />This image is used for both temporary DGDs created during online profiling and the final DGD.<br />If omitted, the image from the base config file (e.g., disagg.yaml) is used.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" |  | Optional: \{\} <br /> |
-
-
-#### DeploymentStatus
-
-
-
-DeploymentStatus tracks the state of an auto-created DynamoGraphDeployment.
-This status is populated when autoApply is enabled and a DGD is created.
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name is the name of the created DynamoGraphDeployment. |  |  |
-| `namespace` _string_ | Namespace is the namespace of the created DynamoGraphDeployment. |  |  |
-| `state` _string_ | State is the current state of the DynamoGraphDeployment.<br />This value is mirrored from the DGD's status.state field. |  |  |
-| `created` _boolean_ | Created indicates whether the DGD has been successfully created.<br />Used to prevent recreation if the DGD is manually deleted by users. |  |  |
-
-
-
-
-#### DynamoCheckpoint
-
-
-
-DynamoCheckpoint is the Schema for the dynamocheckpoints API
-It represents a container checkpoint that can be used to restore pods to a warm state
-
-
-
-
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
-| `kind` _string_ | `DynamoCheckpoint` | | |
-| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
-| `spec` _[DynamoCheckpointSpec](#dynamocheckpointspec)_ |  |  |  |
-| `status` _[DynamoCheckpointStatus](#dynamocheckpointstatus)_ |  |  |  |
-
-
-
-
-#### DynamoCheckpointIdentity
-
-
-
-DynamoCheckpointIdentity defines the inputs that determine checkpoint equivalence
-Two checkpoints with the same identity hash are considered equivalent
-
-
-
-_Appears in:_
- [DynamoCheckpointSpec](#dynamocheckpointspec)
- [ServiceCheckpointConfig](#servicecheckpointconfig)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `model` _string_ | Model is the model identifier (e.g., "meta-llama/Llama-3-70B") |  | Required: \{\} <br /> |
-| `backendFramework` _string_ | BackendFramework is the runtime framework (vllm, sglang, trtllm) |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
-| `dynamoVersion` _string_ | DynamoVersion is the Dynamo platform version (optional)<br />If not specified, version is not included in identity hash<br />This ensures checkpoint compatibility across Dynamo releases |  | Optional: \{\} <br /> |
-| `tensorParallelSize` _integer_ | TensorParallelSize is the tensor parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
-| `pipelineParallelSize` _integer_ | PipelineParallelSize is the pipeline parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
-| `dtype` _string_ | Dtype is the data type (fp16, bf16, fp8, etc.) |  | Optional: \{\} <br /> |
-| `maxModelLen` _integer_ | MaxModelLen is the maximum sequence length |  | Minimum: 1 <br />Optional: \{\} <br /> |
-| `extraParameters` _object (keys:string, values:string)_ | ExtraParameters are additional parameters that affect the checkpoint hash<br />Use for any framework-specific or custom parameters not covered above |  | Optional: \{\} <br /> |
-
-
-#### DynamoCheckpointJobConfig
-
-
-
-DynamoCheckpointJobConfig defines the configuration for the checkpoint creation Job
-
-
-
-_Appears in:_
- [DynamoCheckpointSpec](#dynamocheckpointspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br />This should include the container that runs the workload to be checkpointed |  | Required: \{\} <br /> |
-| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Optional: \{\} <br /> |
-| `backoffLimit` _integer_ | BackoffLimit specifies the number of retries before marking the Job failed | 3 | Optional: \{\} <br /> |
-| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Optional: \{\} <br /> |
-
-
-#### DynamoCheckpointPhase
-
-_Underlying type:_ _string_
-
-DynamoCheckpointPhase represents the current phase of the checkpoint lifecycle
-
-_Validation:_
- Enum: [Pending Creating Ready Failed]
-
-_Appears in:_
- [DynamoCheckpointStatus](#dynamocheckpointstatus)
-
-| Field | Description |
-| --- | --- |
-| `Pending` | DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started<br /> |
-| `Creating` | DynamoCheckpointPhaseCreating indicates the checkpoint Job is running<br /> |
-| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC<br /> |
-| `Failed` | DynamoCheckpointPhaseFailed indicates the checkpoint creation failed<br /> |
-
-
-#### DynamoCheckpointSpec
-
-
-
-DynamoCheckpointSpec defines the desired state of DynamoCheckpoint
-
-
-
-_Appears in:_
- [DynamoCheckpoint](#dynamocheckpoint)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the inputs that determine checkpoint equivalence |  | Required: \{\} <br /> |
-| `job` _[DynamoCheckpointJobConfig](#dynamocheckpointjobconfig)_ | Job defines the configuration for the checkpoint creation Job |  | Required: \{\} <br /> |
-
-
-#### DynamoCheckpointStatus
-
-
-
-DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
-
-
-
-_Appears in:_
- [DynamoCheckpoint](#dynamocheckpoint)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `phase` _[DynamoCheckpointPhase](#dynamocheckpointphase)_ | Phase represents the current phase of the checkpoint lifecycle |  | Enum: [Pending Creating Ready Failed] <br />Optional: \{\} <br /> |
-| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity<br />This hash is used to identify equivalent checkpoints |  | Optional: \{\} <br /> |
-| `location` _string_ | Location is the full URI/path to the checkpoint in the storage backend<br />For PVC: same as TarPath (e.g., /checkpoints/\{hash\}.tar)<br />For S3: s3://bucket/prefix/\{hash\}.tar<br />For OCI: oci://registry/repo:\{hash\} |  | Optional: \{\} <br /> |
-| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | StorageType indicates the storage backend type used for this checkpoint |  | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
-| `jobName` _string_ | JobName is the name of the checkpoint creation Job |  | Optional: \{\} <br /> |
-| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created |  | Optional: \{\} <br /> |
-| `message` _string_ | Message provides additional information about the current state |  | Optional: \{\} <br /> |
-| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represent the latest available observations of the checkpoint's state |  | Optional: \{\} <br /> |
-
-
-#### DynamoCheckpointStorageType
-
-_Underlying type:_ _string_
-
-DynamoCheckpointStorageType defines the supported storage backends for checkpoints
-
-_Validation:_
- Enum: [pvc s3 oci]
-
-_Appears in:_
- [DynamoCheckpointStatus](#dynamocheckpointstatus)
-
-
-
-#### DynamoComponentDeployment
-
-
-
-DynamoComponentDeployment is the Schema for the dynamocomponentdeployments API
-
-
-
-
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
-| `kind` _string_ | `DynamoComponentDeployment` | | |
-| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
-| `spec` _[DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)_ | Spec defines the desired state for this Dynamo component deployment. |  |  |
-
-
-#### DynamoComponentDeploymentSharedSpec
-
-
-
-
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `annotations` _object (keys:string, values:string)_ | Annotations to add to generated Kubernetes resources for this component<br />(such as Pod, Service, and Ingress when applicable). |  |  |
-| `labels` _object (keys:string, values:string)_ | Labels to add to generated Kubernetes resources for this component. |  |  |
-| `serviceName` _string_ | The name of the component |  |  |
-| `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). |  |  |
-| `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). |  |  |
-| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component |  | Optional: \{\} <br /> |
-| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace |  |  |
-| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. |  |  |
-| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. |  |  |
-| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. |  |  |
-| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. |  |  |
-| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. |  |  |
-| `ingress` _[IngressSpec](#ingressspec)_ | Ingress config to expose the component outside the cluster (or through a service mesh). |  |  |
-| `modelRef` _[ModelReference](#modelreference)_ | ModelRef references a model that this component serves<br />When specified, a headless service will be created for endpoint discovery |  | Optional: \{\} <br /> |
-| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size). |  |  |
-| `extraPodMetadata` _[ExtraPodMetadata](#extrapodmetadata)_ | ExtraPodMetadata adds labels/annotations to the created Pods. |  | Optional: \{\} <br /> |
-| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows overriding the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. |  | Optional: \{\} <br /> |
-| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. |  |  |
-| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. |  |  |
-| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled, this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. |  | Minimum: 0 <br /> |
-| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. |  |  |
-| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. |  | Optional: \{\} <br /> |
-| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". |  | Optional: \{\} <br /> |
-| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from checkpoint files for faster cold start. |  | Optional: \{\} <br /> |
-
-
-#### DynamoComponentDeploymentSpec
-
-
-
-DynamoComponentDeploymentSpec defines the desired state of DynamoComponentDeployment
-
-
-
-_Appears in:_
- [DynamoComponentDeployment](#dynamocomponentdeployment)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm") |  | Enum: [sglang vllm trtllm] <br /> |
-| `annotations` _object (keys:string, values:string)_ | Annotations to add to generated Kubernetes resources for this component<br />(such as Pod, Service, and Ingress when applicable). |  |  |
-| `labels` _object (keys:string, values:string)_ | Labels to add to generated Kubernetes resources for this component. |  |  |
-| `serviceName` _string_ | The name of the component |  |  |
-| `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). |  |  |
-| `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). |  |  |
-| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component |  | Optional: \{\} <br /> |
-| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace |  |  |
-| `resources` _[Resources](#resources)_ | Resource requests and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. |  |  |
-| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. |  |  |
-| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. |  |  |
-| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. |  |  |
-| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. |  |  |
-| `ingress` _[IngressSpec](#ingressspec)_ | Ingress config to expose the component outside the cluster (or through a service mesh). |  |  |
-| `modelRef` _[ModelReference](#modelreference)_ | ModelRef references a model that this component serves<br />When specified, a headless service will be created for endpoint discovery |  | Optional: \{\} <br /> |
-| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size). |  |  |
-| `extraPodMetadata` _[ExtraPodMetadata](#extrapodmetadata)_ | ExtraPodMetadata adds labels/annotations to the created Pods. |  | Optional: \{\} <br /> |
-| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows overriding the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. |  | Optional: \{\} <br /> |
-| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. |  |  |
-| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. |  |  |
-| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled, this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. |  | Minimum: 0 <br /> |
-| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. |  |  |
-| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. |  | Optional: \{\} <br /> |
-| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". |  | Optional: \{\} <br /> |
-| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from checkpoint files for faster cold start. |  | Optional: \{\} <br /> |
-
-
-#### DynamoGraphDeployment
-
-
-
-DynamoGraphDeployment is the Schema for the dynamographdeployments API.
-
-
-
-
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
-| `kind` _string_ | `DynamoGraphDeployment` | | |
-| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
-| `spec` _[DynamoGraphDeploymentSpec](#dynamographdeploymentspec)_ | Spec defines the desired state for this graph deployment. |  |  |
-| `status` _[DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)_ | Status reflects the current observed state of this graph deployment. |  |  |
-
-
-#### DynamoGraphDeploymentRequest
-
-
-
-DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
-It serves as the primary interface for users to request model deployments with
-specific performance and resource constraints, enabling SLA-driven deployments.
-
-Lifecycle:
- 1. Initial → Pending: Validates spec and prepares for profiling
- 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
- 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
- 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
- 5. Ready: Terminal state when DGD is operational or spec is available
- 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
-
-The spec becomes immutable once profiling starts. Users must delete and recreate
-the DGDR to modify configuration after this point.
-
-
-
-
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
-| `kind` _string_ | `DynamoGraphDeploymentRequest` | | |
-| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
-| `spec` _[DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)_ | Spec defines the desired state for this deployment request. |  |  |
-| `status` _[DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)_ | Status reflects the current observed state of this deployment request. |  |  |
-
-
-#### DynamoGraphDeploymentRequestSpec
-
-
-
-DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
-This CRD serves as the primary interface for users to request model deployments with
-specific performance constraints and resource requirements, enabling SLA-driven deployments.
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. |  | Required: \{\} <br /> |
-| `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
-| `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false |  |
-| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,<br />numGpusPerNode) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> |
-| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />model and backend fields and should not be specified in this config. |  | Required: \{\} <br /> |
-| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false |  |
-| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. |  | Optional: \{\} <br /> |
-
-
-#### DynamoGraphDeploymentRequestStatus
-
-
-
-DynamoGraphDeploymentRequestStatus represents the observed state of a DynamoGraphDeploymentRequest.
-The controller updates this status as the DGDR progresses through its lifecycle.
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. |  |  |
-| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. |  | Optional: \{\} <br /> |
-| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. |  |  |
-| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. |  |  |
-| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" |  | Optional: \{\} <br /> |
-| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata.<br />For mocker backends, this contains the mocker DGD spec. |  | EmbeddedResource: \{\} <br />Optional: \{\} <br /> |
-| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. |  | Optional: \{\} <br /> |
-
-
-#### DynamoGraphDeploymentScalingAdapter
-
-
-
-DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services
-within a DynamoGraphDeployment. It implements the Kubernetes scale
-subresource, enabling integration with HPA, KEDA, and custom autoscalers.
-
-The adapter acts as an intermediary between autoscalers and the DGD,
-ensuring that only the adapter controller modifies the DGD's service replicas.
-This prevents conflicts when multiple autoscaling mechanisms are in play.
-
-
-
-
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
-| `kind` _string_ | `DynamoGraphDeploymentScalingAdapter` | | |
-| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
-| `spec` _[DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)_ |  |  |  |
-| `status` _[DynamoGraphDeploymentScalingAdapterStatus](#dynamographdeploymentscalingadapterstatus)_ |  |  |  |
-
-
-#### DynamoGraphDeploymentScalingAdapterSpec
-
-
-
-DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.<br />This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. |  | Minimum: 0 <br />Required: \{\} <br /> |
-| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. |  | Required: \{\} <br /> |
-
-
-#### DynamoGraphDeploymentScalingAdapterStatus
-
-
-
-DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `replicas` _integer_ | Replicas is the current number of replicas for the target service.<br />This is synced from the DGD's service replicas and is required for the scale subresource. |  | Optional: \{\} <br /> |
-| `selector` _string_ | Selector is a label selector string for the pods managed by this adapter.<br />Required for HPA compatibility via the scale subresource. |  | Optional: \{\} <br /> |
-| `lastScaleTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | LastScaleTime is the last time the adapter scaled the target service. |  | Optional: \{\} <br /> |
-
-
-#### DynamoGraphDeploymentServiceRef
-
-
-
-DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name of the DynamoGraphDeployment |  | MinLength: 1 <br />Required: \{\} <br /> |
-| `serviceName` _string_ | ServiceName is the key name of the service within the DGD's spec.services map to scale |  | MinLength: 1 <br />Required: \{\} <br /> |
-
-
-#### DynamoGraphDeploymentSpec
-
-
-
-DynamoGraphDeploymentSpec defines the desired state of DynamoGraphDeployment.
-
-
-
-_Appears in:_
- [DynamoGraphDeployment](#dynamographdeployment)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. |  | MaxItems: 100 <br />Optional: \{\} <br /> |
-| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. |  | MaxProperties: 25 <br />Optional: \{\} <br /> |
-| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. |  | Optional: \{\} <br /> |
-| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). |  | Enum: [sglang vllm trtllm] <br /> |
-| `restart` _[Restart](#restart)_ | Restart specifies the restart policy for the graph deployment. |  | Optional: \{\} <br /> |
-
-
-#### DynamoGraphDeploymentStatus
-
-
-
-DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment.
-
-
-
-_Appears in:_
- [DynamoGraphDeployment](#dynamographdeployment)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `state` _string_ | State is a high-level textual status of the graph deployment lifecycle. |  |  |
-| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.<br />The slice is merged by type on patch updates. |  |  |
-| `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.<br />The map key is the service name from spec.services. |  | Optional: \{\} <br /> |
-| `restart` _[RestartStatus](#restartstatus)_ | Restart contains the status of the restart of the graph deployment. |  | Optional: \{\} <br /> |
-| `checkpoints` _object (keys:string, values:[ServiceCheckpointStatus](#servicecheckpointstatus))_ | Checkpoints contains per-service checkpoint status information.<br />The map key is the service name from spec.services. |  | Optional: \{\} <br /> |
-
-
-#### DynamoModel
-
-
-
-DynamoModel is the Schema for the dynamo models API
-
-
-
-
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
-| `kind` _string_ | `DynamoModel` | | |
-| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
-| `spec` _[DynamoModelSpec](#dynamomodelspec)_ |  |  |  |
-| `status` _[DynamoModelStatus](#dynamomodelstatus)_ |  |  |  |
-
-
-#### DynamoModelSpec
-
-
-
-DynamoModelSpec defines the desired state of DynamoModel
-
-
-
-_Appears in:_
- [DynamoModel](#dynamomodel)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `modelName` _string_ | ModelName is the full model identifier (e.g., "meta-llama/Llama-3.3-70B-Instruct-lora") |  | Required: \{\} <br /> |
-| `baseModelName` _string_ | BaseModelName is the base model identifier that matches the service label<br />This is used to discover endpoints via headless services |  | Required: \{\} <br /> |
-| `modelType` _string_ | ModelType specifies the type of model (e.g., "base", "lora", "adapter") | base | Enum: [base lora adapter] <br />Optional: \{\} <br /> |
-| `source` _[ModelSource](#modelsource)_ | Source specifies the model source location (only applicable for lora model type) |  | Optional: \{\} <br /> |
-
-
-#### DynamoModelStatus
-
-
-
-DynamoModelStatus defines the observed state of DynamoModel
-
-
-
-_Appears in:_
- [DynamoModel](#dynamomodel)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `endpoints` _[EndpointInfo](#endpointinfo) array_ | Endpoints is the current list of all endpoints for this model |  | Optional: \{\} <br /> |
-| `readyEndpoints` _integer_ | ReadyEndpoints is the count of endpoints that are ready |  |  |
-| `totalEndpoints` _integer_ | TotalEndpoints is the total count of endpoints |  |  |
-| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represents the latest available observations of the model's state |  | Optional: \{\} <br /> |
-
-
-#### EPPConfig
-
-
-
-EPPConfig contains configuration for EPP (Endpoint Picker Plugin) components.
-EPP is responsible for intelligent endpoint selection and KV-aware routing.
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `configMapRef` _[ConfigMapKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#configmapkeyselector-v1-core)_ | ConfigMapRef references a user-provided ConfigMap containing EPP configuration.<br />The ConfigMap should contain EndpointPickerConfig YAML.<br />Mutually exclusive with Config. |  | Optional: \{\} <br /> |
-| `config` _[EndpointPickerConfig](#endpointpickerconfig)_ | Config allows specifying EPP EndpointPickerConfig directly as a structured object.<br />The operator will marshal this to YAML and create a ConfigMap automatically.<br />Mutually exclusive with ConfigMapRef.<br />One of ConfigMapRef or Config must be specified (no default configuration).<br />Uses the upstream type from github.com/kubernetes-sigs/gateway-api-inference-extension |  | Type: object <br />Optional: \{\} <br /> |
-
-
-#### EndpointInfo
-
-
-
-EndpointInfo represents a single endpoint (pod) serving the model
-
-
-
-_Appears in:_
- [DynamoModelStatus](#dynamomodelstatus)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `address` _string_ | Address is the full address of the endpoint (e.g., "http://10.0.1.5:9090") |  |  |
-| `podName` _string_ | PodName is the name of the pod serving this endpoint |  | Optional: \{\} <br /> |
-| `ready` _boolean_ | Ready indicates whether the endpoint is ready to serve traffic<br />For LoRA models: true if the POST /loras request succeeded with a 2xx status code<br />For base models: always false (no probing performed) |  |  |
-
-
-#### ExtraPodMetadata
-
-
-
-
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `annotations` _object (keys:string, values:string)_ |  |  |  |
-| `labels` _object (keys:string, values:string)_ |  |  |  |
-
-
-#### ExtraPodSpec
-
-
-
-
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `mainContainer` _[Container](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#container-v1-core)_ |  |  |  |
-
-
-#### IngressSpec
-
-
-
-
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `enabled` _boolean_ | Enabled exposes the component through an ingress or virtual service when true. |  |  |
-| `host` _string_ | Host is the base host name to route external traffic to this component. |  |  |
-| `useVirtualService` _boolean_ | UseVirtualService indicates whether to configure a service-mesh VirtualService instead of a standard Ingress. |  |  |
-| `virtualServiceGateway` _string_ | VirtualServiceGateway optionally specifies the gateway name to attach the VirtualService to. |  |  |
-| `hostPrefix` _string_ | HostPrefix is an optional prefix added before the host. |  |  |
-| `annotations` _object (keys:string, values:string)_ | Annotations to set on the generated Ingress/VirtualService resources. |  |  |
-| `labels` _object (keys:string, values:string)_ | Labels to set on the generated Ingress/VirtualService resources. |  |  |
-| `tls` _[IngressTLSSpec](#ingresstlsspec)_ | TLS holds the TLS configuration used by the Ingress/VirtualService. |  |  |
-| `hostSuffix` _string_ | HostSuffix is an optional suffix appended after the host. |  |  |
-| `ingressControllerClassName` _string_ | IngressControllerClassName selects the ingress controller class (e.g., "nginx"). |  |  |
-
-
-#### IngressTLSSpec
-
-
-
-
-
-
-
-_Appears in:_
- [IngressSpec](#ingressspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `secretName` _string_ | SecretName is the name of a Kubernetes Secret containing the TLS certificate and key. |  |  |
-
-
-
-
-#### ModelReference
-
-
-
-ModelReference identifies a model served by this component
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name is the base model identifier (e.g., "llama-3-70b-instruct-v1") |  | Required: \{\} <br /> |
-| `revision` _string_ | Revision is the model revision/version (optional) |  | Optional: \{\} <br /> |
-
-
-#### ModelSource
-
-
-
-ModelSource defines the source location of a model
-
-
-
-_Appears in:_
- [DynamoModelSpec](#dynamomodelspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `uri` _string_ | URI is the model source URI<br />Supported formats:<br />- S3: s3://bucket/path/to/model<br />- HuggingFace: hf://org/model@revision_sha |  | Required: \{\} <br /> |
-
-
-#### MultinodeSpec
-
-
-
-
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `nodeCount` _integer_ | Indicates the number of nodes to deploy for multinode components.<br />Total number of GPUs is NumberOfNodes * GPU limit.<br />Must be greater than 1. | 2 | Minimum: 2 <br /> |
-
-
-#### PVC
-
-
-
-
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `create` _boolean_ | Create indicates whether to create a new PVC |  |  |
-| `name` _string_ | Name is the name of the PVC |  | Required: \{\} <br /> |
-| `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. |  |  |
-| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. |  |  |
-| `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. |  |  |
-
-
-#### ProfilingConfigSpec
-
-
-
-ProfilingConfigSpec defines configuration for the profiling process.
-This structure maps directly to the profile_sla.py config format.
-See benchmarks/profiler/utils/profiler_argparse.py for the complete schema.
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. |  | Optional: \{\} <br />Type: object <br /> |
-| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. |  | Optional: \{\} <br /> |
-| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.<br />This image contains the profiler code and dependencies needed for SLA-based profiling.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" |  | Required: \{\} <br /> |
-| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.<br />If specified, all profiling artifacts (logs, plots, configs, raw data) will be written<br />to this PVC instead of an ephemeral emptyDir volume. This allows users to access<br />complete profiling results after the job completes by mounting the PVC.<br />The PVC must exist in the same namespace as the DGDR.<br />If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.<br />Note: ConfigMaps are still created regardless of this setting for planner integration. |  | Optional: \{\} <br /> |
-| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.<br />If not specified, no resource requests or limits are set. |  | Optional: \{\} <br /> |
-| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.<br />For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. |  | Optional: \{\} <br /> |
-
-
-#### ResourceItem
-
-
-
-
-
-
-
-_Appears in:_
- [Resources](#resources)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `cpu` _string_ | CPU specifies the CPU resource request/limit (e.g., "1000m", "2") |  |  |
-| `memory` _string_ | Memory specifies the memory resource request/limit (e.g., "4Gi", "8Gi") |  |  |
-| `gpu` _string_ | GPU indicates the number of GPUs to request.<br />Total number of GPUs is NumberOfNodes * GPU in case of multinode deployment. |  |  |
-| `gpuType` _string_ | GPUType can specify a custom GPU type, e.g. "gpu.intel.com/xe"<br />By default if not specified, the GPU type is "nvidia.com/gpu" |  |  |
-| `custom` _object (keys:string, values:string)_ | Custom specifies additional custom resource requests/limits |  |  |
-
-
-#### Resources
-
-
-
-Resources defines resource requests and limits for a component, including CPU, memory,
-GPUs/devices, and any runtime-specific resources.
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `requests` _[ResourceItem](#resourceitem)_ | Requests specifies the minimum resources required by the component |  |  |
-| `limits` _[ResourceItem](#resourceitem)_ | Limits specifies the maximum resources allowed for the component |  |  |
-| `claims` _[ResourceClaim](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourceclaim-v1-core) array_ | Claims specifies resource claims for dynamic resource allocation |  |  |
-
-
-#### Restart
-
-
-
-
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `id` _string_ | ID is an arbitrary string that triggers a restart when changed.<br />Any modification to this value will initiate a restart of the graph deployment according to the strategy. |  | MinLength: 1 <br />Required: \{\} <br /> |
-| `strategy` _[RestartStrategy](#restartstrategy)_ | Strategy specifies the restart strategy for the graph deployment. |  | Optional: \{\} <br /> |
-
-
-#### RestartPhase
-
-_Underlying type:_ _string_
-
-
-
-
-
-_Appears in:_
- [RestartStatus](#restartstatus)
-
-| Field | Description |
-| --- | --- |
-| `Pending` |  |
-| `Restarting` |  |
-| `Completed` |  |
-| `Failed` |  |
-
-
-#### RestartStatus
-
-
-
-RestartStatus contains the status of the restart of the graph deployment.
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `observedID` _string_ | ObservedID is the restart ID that has been observed and is being processed.<br />Matches the Restart.ID field in the spec. |  |  |
-| `phase` _[RestartPhase](#restartphase)_ | Phase is the phase of the restart. |  |  |
-| `inProgress` _string array_ | InProgress contains the names of the services that are currently being restarted. |  | Optional: \{\} <br /> |
-
-
-#### RestartStrategy
-
-
-
-
-
-
-
-_Appears in:_
- [Restart](#restart)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `type` _[RestartStrategyType](#restartstrategytype)_ | Type specifies the restart strategy type. | Sequential | Enum: [Sequential Parallel] <br /> |
-| `order` _string array_ | Order specifies the order in which the services should be restarted. |  | Optional: \{\} <br /> |
-
-
-#### RestartStrategyType
-
-_Underlying type:_ _string_
-
-
-
-
-
-_Appears in:_
- [RestartStrategy](#restartstrategy)
-
-| Field | Description |
-| --- | --- |
-| `Sequential` |  |
-| `Parallel` |  |
-
-
-#### ScalingAdapter
-
-
-
-ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter
-for replica management. When enabled, the DGDSA owns the replicas field and
-external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource.
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `enabled` _boolean_ | Enabled indicates whether the ScalingAdapter should be enabled for this service.<br />When true, a DGDSA is created and owns the replicas field.<br />When false (default), no DGDSA is created and replicas can be modified directly in the DGD. | false | Optional: \{\} <br /> |
-
-
-#### ServiceCheckpointConfig
-
-
-
-ServiceCheckpointConfig configures checkpointing for a DGD service
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\} <br /> |
-| `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br />- Auto: DGD controller creates Checkpoint CR automatically<br />- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br />Optional: \{\} <br /> |
-| `checkpointRef` _string_ | CheckpointRef references an existing Checkpoint CR to use<br />If specified, Identity is ignored and this checkpoint is used directly |  | Optional: \{\} <br /> |
-| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br />Used when Mode is Auto or when looking up existing checkpoints<br />Required when checkpointRef is not specified |  | Optional: \{\} <br /> |
-
-
-#### ServiceCheckpointStatus
-
-
-
-ServiceCheckpointStatus contains checkpoint information for a single service.
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `checkpointName` _string_ | CheckpointName is the name of the associated Checkpoint CR |  | Optional: \{\} <br /> |
-| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity |  | Optional: \{\} <br /> |
-| `ready` _boolean_ | Ready indicates if the checkpoint is ready for use |  | Optional: \{\} <br /> |
-
-
-#### ServiceReplicaStatus
-
-
-
-ServiceReplicaStatus contains replica information for a single service.
-
-
-
-_Appears in:_
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `componentKind` _[ComponentKind](#componentkind)_ | ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet"). |  | Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet] <br /> |
-| `componentName` _string_ | ComponentName is the name of the underlying resource. |  |  |
-| `replicas` _integer_ | Replicas is the total number of non-terminated replicas.<br />Required for all component kinds. |  | Minimum: 0 <br /> |
-| `updatedReplicas` _integer_ | UpdatedReplicas is the number of replicas at the current/desired revision.<br />Required for all component kinds. |  | Minimum: 0 <br /> |
-| `readyReplicas` _integer_ | ReadyReplicas is the number of ready replicas.<br />Populated for PodClique, Deployment, and LeaderWorkerSet.<br />Not available for PodCliqueScalingGroup.<br />When nil, the field is omitted from the API response. |  | Minimum: 0 <br />Optional: \{\} <br /> |
-| `availableReplicas` _integer_ | AvailableReplicas is the number of available replicas.<br />For Deployment: replicas ready for >= minReadySeconds.<br />For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.<br />Not available for PodClique or LeaderWorkerSet.<br />When nil, the field is omitted from the API response. |  | Minimum: 0 <br />Optional: \{\} <br /> |
-
-
-#### SharedMemorySpec
-
-
-
-
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
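-| `disabled` _boolean_ | Disabled turns off the shared memory volume that is otherwise mounted at `/dev/shm` by default. |  |  |
-| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size sets the size of the shared memory volume; the operator uses `8Gi` when it is not specified. |  |  |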
-
-
-#### VolumeMount
-
-
-
-VolumeMount references a PVC defined at the top level for volumes to be mounted by the component
-
-
-
-_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name references a PVC name defined in the top-level PVCs map |  | Required: \{\} <br /> |
-| `mountPoint` _string_ | MountPoint specifies where to mount the volume.<br />If useAsCompilationCache is true and mountPoint is not specified,<br />a backend-specific default will be used. |  |  |
-| `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.<br />When true, backend-specific environment variables will be set and default mount points may be used. | false |  |
-
-
-# Operator Default Values Injection
-
-The Dynamo operator automatically applies default values to various fields when they are not explicitly specified in your deployments. These defaults include:
-
- **Health Probes**: Startup, liveness, and readiness probes are configured differently for frontend, worker, and planner components. For example, worker components receive a startup probe with a 2-hour timeout (720 failures × 10 seconds) to accommodate long model loading times.
-
- **Security Context**: All components receive `fsGroup: 1000` by default to ensure proper file permissions for mounted volumes. This can be overridden via the `extraPodSpec.securityContext` field.
-
- **Shared Memory**: All components receive an 8Gi shared memory volume mounted at `/dev/shm` by default (can be disabled or resized via the `sharedMemory` field).
-
- **Environment Variables**: Components automatically receive environment variables like `DYN_NAMESPACE`, `DYN_PARENT_DGD_K8S_NAME`, `DYNAMO_PORT`, and backend-specific variables.
-
- **Pod Configuration**: Default `terminationGracePeriodSeconds` of 60 seconds and `restartPolicy: Always`.
-
- **Autoscaling**: When enabled without explicit metrics, defaults to CPU-based autoscaling with 80% target utilization.
-
- **Backend-Specific Behavior**: For multinode deployments, probes are automatically modified or removed for worker nodes depending on the backend framework (VLLM, SGLang, or TensorRT-LLM).
-
-## Pod Specification Defaults
-
-All components receive the following pod-level defaults unless overridden:
-
- **`terminationGracePeriodSeconds`**: `60` seconds
- **`restartPolicy`**: `Always`
-
-## Security Context
-
-The operator automatically applies default security context settings to all components to ensure proper file permissions, particularly for mounted volumes:
-
- **`fsGroup`**: `1000` - Sets the group ownership of mounted volumes and any files created in those volumes
-
-This default ensures that non-root containers can write to mounted volumes (like model caches or persistent storage) without permission issues. The `fsGroup` setting is particularly important for:
- Model downloads and caching
- Compilation cache directories
- Persistent volume claims (PVCs)
- SSH key generation in multinode deployments
-
-### Overriding Security Context
-
-To override the default security context, specify your own `securityContext` in the `extraPodSpec` of your component:
-
-```yaml
-services:
-  YourWorker:
-    extraPodSpec:
-      securityContext:
-        fsGroup: 2000  # Custom group ID
-        runAsUser: 1000
-        runAsGroup: 1000
-        runAsNonRoot: true
-```
-
-**Important**: When you provide *any* `securityContext` object in `extraPodSpec`, the operator will not inject any defaults. This gives you complete control over the security context, including the ability to run as root (by omitting `runAsNonRoot` or setting it to `false`).
-
-### OpenShift and Security Context Constraints
-
-In OpenShift environments with Security Context Constraints (SCCs), you may need to omit explicit UID/GID values to allow OpenShift's admission controllers to assign them dynamically:
-
-```yaml
-services:
-  YourWorker:
-    extraPodSpec:
-      securityContext: {}  # Explicit empty object; a bare `securityContext:` is null, not an object
-      # Omit fsGroup to let OpenShift assign it based on SCC
-      # OpenShift will inject the appropriate UID range
-```
-
-Alternatively, if you want to keep the default `fsGroup: 1000` behavior and are certain your cluster allows it, you don't need to specify anything - the operator defaults will work.
-
-## Shared Memory Configuration
-
-Shared memory is enabled by default for all components:
-
- **Enabled**: `true` (unless explicitly disabled via `sharedMemory.disabled`)
- **Size**: `8Gi`
- **Mount Path**: `/dev/shm`
- **Volume Type**: `emptyDir` with `memory` medium
-
-To disable shared memory or customize the size, use the `sharedMemory` field in your component specification.
-
-## Health Probes by Component Type
-
-The operator applies different default health probes based on the component type.
-
-### Frontend Components
-
-Frontend components receive the following probe configurations:
-
-**Liveness Probe:**
- **Type**: HTTP GET
- **Path**: `/health`
- **Port**: `http` (8000)
- **Initial Delay**: 60 seconds
- **Period**: 60 seconds
- **Timeout**: 30 seconds
- **Failure Threshold**: 10
-
-**Readiness Probe:**
- **Type**: Exec command
- **Command**: `curl -s http://localhost:${DYNAMO_PORT}/health | jq -e ".status == \"healthy\""`
- **Initial Delay**: 60 seconds
- **Period**: 60 seconds
- **Timeout**: 30 seconds
- **Failure Threshold**: 10
-
-### Worker Components
-
-Worker components receive the following probe configurations:
-
-**Liveness Probe:**
- **Type**: HTTP GET
- **Path**: `/live`
- **Port**: `system` (9090)
- **Period**: 5 seconds
- **Timeout**: 30 seconds
- **Failure Threshold**: 1
-
-**Readiness Probe:**
- **Type**: HTTP GET
- **Path**: `/health`
- **Port**: `system` (9090)
- **Period**: 10 seconds
- **Timeout**: 30 seconds
- **Failure Threshold**: 60
-
-**Startup Probe:**
- **Type**: HTTP GET
- **Path**: `/live`
- **Port**: `system` (9090)
- **Period**: 10 seconds
- **Timeout**: 5 seconds
- **Failure Threshold**: 720 (allows up to 2 hours for startup: 10s × 720 = 7200s)
-
-:::{note}
-For larger models (typically >70B parameters) or slower storage systems, you may need to increase the `failureThreshold` to allow more time for model loading. Calculate the required threshold based on your expected startup time: `failureThreshold = (expected_startup_seconds / period)`. Override the startup probe in your component specification if the default 2-hour window is insufficient.
-:::
-
-### Multinode Deployment Probe Modifications
-
-For multinode deployments, the operator modifies probes based on the backend framework and node role:
-
-#### VLLM Backend
-
-The operator automatically selects between two deployment modes based on parallelism configuration:
-
-**Tensor/Pipeline Parallel Mode** (when `world_size > GPUs_per_node`):
- Uses Ray for distributed execution (`--distributed-executor-backend ray`)
- **Leader nodes**: Starts Ray head and runs vLLM; all probes remain active
- **Worker nodes**: Run Ray agents only; all probes (liveness, readiness, startup) are removed
-
-**Data Parallel Mode** (when `world_size × data_parallel_size > GPUs_per_node`):
- **Worker nodes**: All probes (liveness, readiness, startup) are removed
- **Leader nodes**: All probes remain active
-
-#### SGLang Backend
- **Worker nodes**: All probes (liveness, readiness, startup) are removed
-
-#### TensorRT-LLM Backend
- **Leader nodes**: All probes remain unchanged
- **Worker nodes**:
-  - Liveness and startup probes are removed
-  - Readiness probe is replaced with a TCP socket check on SSH port (2222):
-    - **Initial Delay**: 20 seconds
-    - **Period**: 20 seconds
-    - **Timeout**: 5 seconds
-    - **Failure Threshold**: 10
-
-## Environment Variables
-
-The operator automatically injects environment variables based on component type and configuration:
-
-### All Components
-
- **`DYN_NAMESPACE`**: The Dynamo namespace for the component
- **`DYN_PARENT_DGD_K8S_NAME`**: The parent DynamoGraphDeployment Kubernetes resource name
- **`DYN_PARENT_DGD_K8S_NAMESPACE`**: The parent DynamoGraphDeployment Kubernetes namespace
-
-### Frontend Components
-
- **`DYNAMO_PORT`**: `8000`
- **`DYN_HTTP_PORT`**: `8000`
-
-### Worker Components
-
- **`DYN_SYSTEM_PORT`**: `9090` (automatically enables the system metrics server)
- **`DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS`**: `["generate"]`
- **`DYN_SYSTEM_ENABLED`**: `true` (needed for runtime images 0.6.1 and older)
-
-### Planner Components
-
- **`PLANNER_PROMETHEUS_PORT`**: `9085`
-
-### VLLM Backend (with compilation cache)
-
-When a volume mount is configured with `useAsCompilationCache: true`:
- **`VLLM_CACHE_ROOT`**: Set to the mount point of the cache volume
-
-## Service Account
-
-Planner components automatically receive the following service account:
-
- **`serviceAccountName`**: `planner-serviceaccount`
-
-## Image Pull Secrets
-
-The operator automatically discovers and injects image pull secrets for container images. When a component specifies a container image, the operator:
-
-1. Scans all Kubernetes secrets of type `kubernetes.io/dockerconfigjson` in the component's namespace
-2. Extracts the docker registry server URLs from each secret's authentication configuration
-3. Matches the container image's registry host against the discovered registry URLs
-4. Automatically injects matching secrets as `imagePullSecrets` in the pod specification
-
-This eliminates the need to manually specify image pull secrets for each component. The operator maintains an internal index of docker secrets and their associated registries, refreshing this index periodically.
-
-**To disable automatic image pull secret discovery** for a specific component, add the following annotation:
-
-```yaml
-annotations:
-  nvidia.com/disable-image-pull-secret-discovery: "true"
-```
-
-## Autoscaling Defaults
-
-When autoscaling is enabled but no metrics are specified, the operator applies:
-
- **Default Metric**: CPU utilization
- **Target Average Utilization**: `80%`
-
-## Port Configurations
-
-Default container ports are configured based on component type:
-
-### Frontend Components
- **Port**: 8000
- **Protocol**: TCP
- **Name**: `http`
-
-### Worker Components
- **Port**: 9090
- **Protocol**: TCP
- **Name**: `system`
-
-### Planner Components
- **Port**: 9085
- **Protocol**: TCP
- **Name**: `metrics`
-
-## Backend-Specific Configurations
-
-### VLLM
- **Ray Head Port**: 6379 (for Ray cluster coordination in multinode TP/PP deployments)
- **Data Parallel RPC Port**: 13445 (for data parallel multinode deployments)
-
-### SGLang
- **Distribution Init Port**: 29500 (for multinode deployments)
-
-### TensorRT-LLM
- **SSH Port**: 2222 (for multinode MPI communication)
- **OpenMPI Environment**: `OMPI_MCA_orte_keep_fqdn_hostnames=1`
-
-## Implementation Reference
-
-For users who want to understand the implementation details or contribute to the operator, the default values described in this document are set in the following source files:
-
- **Health Probes, Security Context & Pod Specifications**: [`internal/dynamo/graph.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/graph.go) - Contains the main logic for applying default probes, security context, environment variables, shared memory, and pod configurations
- **Component-Specific Defaults**:
-  - [`internal/dynamo/component_frontend.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_frontend.go)
-  - [`internal/dynamo/component_worker.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_worker.go)
-  - [`internal/dynamo/component_planner.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_planner.go)
- **Image Pull Secrets**: [`internal/secrets/docker.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/secrets/docker.go) - Implements the docker secret indexer and automatic discovery
- **Backend-Specific Behavior**:
-  - [`internal/dynamo/backend_vllm.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_vllm.go)
-  - [`internal/dynamo/backend_sglang.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_sglang.go)
-  - [`internal/dynamo/backend_trtllm.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_trtllm.go)
- **Constants & Annotations**: [`internal/consts/consts.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/consts/consts.go) - Defines annotation keys and other constants
-
-## Notes
-
- All these defaults can be overridden by explicitly specifying values in your DynamoComponentDeployment or DynamoGraphDeployment resources
- User-specified probes (via `livenessProbe`, `readinessProbe`, or `startupProbe` fields) take precedence over operator defaults
- For security context, if you provide *any* `securityContext` in `extraPodSpec`, no defaults will be injected, giving you full control
- For multinode deployments, some defaults are modified or removed as described above to accommodate distributed execution patterns
- The `extraPodSpec.mainContainer` field can be used to override probe configurations set by the operator
--- a/docs/kubernetes/autoscaling.md
+++ b/docs/kubernetes/autoscaling.md
-# Autoscaling
-
-This guide explains how to configure autoscaling for DynamoGraphDeployment (DGD) services using the `sglang-agg` example from `examples/backends/sglang/deploy/agg.yaml`.
-
-## Example DGD
-
-All examples in this guide use the following DGD:
-
-```yaml
-# examples/backends/sglang/deploy/agg.yaml
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: sglang-agg
-  namespace: default
-spec:
-  services:
-    Frontend:
-      dynamoNamespace: sglang-agg
-      componentType: frontend
-      replicas: 1
-
-    decode:
-      dynamoNamespace: sglang-agg
-      componentType: worker
-      replicas: 1
-      resources:
-        limits:
-          gpu: "1"
-```
-
-**Key identifiers:**
- **DGD name**: `sglang-agg`
- **Namespace**: `default`
- **Services**: `Frontend`, `decode`
- **dynamo_namespace label**: `default-sglang-agg` (used for metric filtering)
-
-## Overview
-
-Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAdapter` (DGDSA) resource. When you deploy a DGD, the operator creates one adapter for each service that sets `scalingAdapter.enabled: true` (the adapter is disabled by default). These adapters implement the Kubernetes [Scale subresource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), enabling integration with:
-
-| Autoscaler | Description | Best For |
-|------------|-------------|----------|
-| **KEDA** | Event-driven autoscaling (recommended) | Most use cases |
-| **Kubernetes HPA** | Native horizontal pod autoscaling | Simple CPU/memory-based scaling |
-| **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads |
-| **Custom Controllers** | Any scale-subresource-compatible controller | Custom requirements |
-
-> **⚠️ Deprecation Notice**: The `spec.services[X].autoscaling` field in DGD is **deprecated and ignored**. Use DGDSA with HPA, KEDA, or Planner instead. If you have existing DGDs with `autoscaling` configured, you'll see a warning. Remove the field to silence the warning.
-
-## Architecture
-
-```
-┌──────────────────────────────────┐          ┌─────────────────────────────────────┐
-│   DynamoGraphDeployment          │          │   Scaling Adapters (auto-created)   │
-│   "sglang-agg"                   │          │   (one per service)                 │
-├──────────────────────────────────┤          ├─────────────────────────────────────┤
-│                                  │          │                                     │
-│  spec.services:                  │          │  ┌─────────────────────────────┐    │      ┌──────────────────┐
-│                                  │          │  │ sglang-agg-frontend         │◄───┼──────│   Autoscalers    │
-│    ┌────────────────────────┐◄───┼──────────┼──│ spec.replicas: 1            │    │      │                  │
-│    │ Frontend: 1 replica    │    │          │  └─────────────────────────────┘    │      │  • KEDA          │
-│    └────────────────────────┘    │          │                                     │      │  • HPA           │
-│                                  │          │  ┌─────────────────────────────┐    │      │  • Planner       │
-│    ┌────────────────────────┐◄───┼──────────┼──│ sglang-agg-decode           │◄───┼──────│  • Custom        │
-│    │ decode:   1 replica    │    │          │  │ spec.replicas: 1            │    │      │                  │
-│    └────────────────────────┘    │          │  └─────────────────────────────┘    │      └──────────────────┘
-│                                  │          │                                     │
-└──────────────────────────────────┘          └─────────────────────────────────────┘
-```
-
-**How it works:**
-
-1. You deploy a DGD with services (Frontend, decode)
-2. The operator creates one DGDSA for each service that has `scalingAdapter.enabled: true`
-3. Autoscalers (KEDA, HPA, Planner) target the adapters via `/scale` subresource
-4. Adapter controller syncs replica changes to the DGD
-5. DGD controller reconciles the underlying pods
-
-## Viewing Scaling Adapters
-
-After deploying the `sglang-agg` DGD with `scalingAdapter.enabled: true` set on its services, verify that the adapters were created:
-
-```bash
-kubectl get dgdsa -n default
-
-# Example output:
-# NAME                  DGD         SERVICE    REPLICAS   AGE
-# sglang-agg-frontend   sglang-agg  Frontend   1          5m
-# sglang-agg-decode     sglang-agg  decode     1          5m
-```
-
-## Replica Ownership Model
-
-When DGDSA is enabled for a service (`scalingAdapter.enabled: true`), it becomes the **source of truth** for that service's replica count. This follows the same pattern as Kubernetes Deployments owning ReplicaSets.
-
-### How It Works
-
-1. **DGDSA owns replicas**: Autoscalers (HPA, KEDA, Planner) update the DGDSA's `spec.replicas`
-2. **DGDSA syncs to DGD**: The DGDSA controller writes the replica count to the DGD's service
-3. **Direct DGD edits blocked**: A validating webhook prevents users from directly editing `spec.services[X].replicas` in the DGD
-4. **Controllers allowed**: Only authorized controllers (operator, Planner) can modify DGD replicas
-
-### Manual Scaling with DGDSA Enabled
-
-When DGDSA is enabled, use `kubectl scale` on the adapter (not the DGD):
-
-```bash
-# ✅ Correct - scale via DGDSA
-kubectl scale dgdsa sglang-agg-decode --replicas=3
-
-# ❌ Blocked - direct DGD edit rejected by webhook
-kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}'
-# Error: spec.services[decode].replicas cannot be modified directly when scaling adapter is enabled;
-#        use 'kubectl scale dgdsa/sglang-agg-decode --replicas=3' or update the DynamoGraphDeploymentScalingAdapter instead
-```
-
-## Enabling DGDSA for a Service
-
-By default, no DGDSA is created for services, allowing direct replica management via the DGD. To enable autoscaling via HPA, KEDA, or Planner, explicitly enable the scaling adapter:
-
-```yaml
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: sglang-agg
-spec:
-  services:
-    Frontend:
-      replicas: 2        # ← No DGDSA by default, direct edits allowed
-
-    decode:
-      replicas: 1
-      scalingAdapter:
-        enabled: true    # ← DGDSA created, managed via adapter
-```
-
-**When to enable DGDSA:**
- You want to use HPA, KEDA, or Planner for autoscaling
- You want a clear separation between "desired scale" (adapter) and "deployment config" (DGD)
- You want protection against accidental direct replica edits
-
-**When to keep DGDSA disabled (default):**
- You want simple, manual replica management
- You don't need autoscaling for that service
- You prefer direct DGD edits over adapter-based scaling
-
-## Autoscaling with Dynamo Planner
-
-The Dynamo Planner is an LLM-aware autoscaler that optimizes scaling decisions based on inference-specific metrics like Time To First Token (TTFT), Inter-Token Latency (ITL), and KV cache utilization.
-
-**When to use Planner:**
- You want LLM-optimized autoscaling out of the box
- You need coordinated scaling across prefill/decode services
- You want SLA-driven scaling (e.g., target TTFT < 500ms)
-
-**How Planner works:**
-
-Planner is deployed as a service component within your DGD. It:
-1. Queries Prometheus for frontend metrics (request rate, latency, etc.)
-2. Uses profiling data to predict optimal replica counts
-3. Scales prefill/decode workers to meet SLA targets
-
-**Deployment:**
-
-The recommended way to deploy Planner is via `DynamoGraphDeploymentRequest` (DGDR). See the [SLA Planner Quick Start](../components/planner/planner_guide.md) for complete instructions.
-
-Example configurations with Planner:
- `examples/backends/vllm/deploy/disagg_planner.yaml`
- `examples/backends/sglang/deploy/disagg_planner.yaml`
- `examples/backends/trtllm/deploy/disagg_planner.yaml`
-
-For more details, see the [SLA Planner documentation](../components/planner/planner_guide.md).
-
-## Autoscaling with Kubernetes HPA
-
-The Horizontal Pod Autoscaler (HPA) is Kubernetes' native autoscaling solution.
-
-**When to use HPA:**
- You have simple, predictable scaling requirements
- You want to use standard Kubernetes tooling
- You need CPU or memory-based scaling
-
-> **Note**: For custom metrics (like TTFT or queue depth), consider using [KEDA](#autoscaling-with-keda-recommended) instead - it's simpler to configure.
-
-### Basic HPA (CPU-based)
-
-```yaml
-apiVersion: autoscaling/v2
-kind: HorizontalPodAutoscaler
-metadata:
-  name: sglang-agg-frontend-hpa
-  namespace: default
-spec:
-  scaleTargetRef:
-    apiVersion: nvidia.com/v1alpha1
-    kind: DynamoGraphDeploymentScalingAdapter
-    name: sglang-agg-frontend
-  minReplicas: 1
-  maxReplicas: 10
-  metrics:
-  - type: Resource
-    resource:
-      name: cpu
-      target:
-        type: Utilization
-        averageUtilization: 70
-  behavior:
-    scaleDown:
-      stabilizationWindowSeconds: 300
-    scaleUp:
-      stabilizationWindowSeconds: 0
-```
-
-### HPA with Dynamo Metrics
-
-Dynamo exports several metrics useful for autoscaling. These are available at the `/metrics` endpoint on each frontend pod.
-
-> **See also**: For a complete list of all Dynamo metrics, see the [Metrics Reference](../observability/metrics.md). For Prometheus and Grafana setup, see the [Prometheus and Grafana Setup Guide](../observability/prometheus-grafana.md).
-
-#### Available Dynamo Metrics
-
-| Metric | Type | Description | Good for scaling |
-|--------|------|-------------|------------------|
-| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in HTTP queue | ✅ Workers |
-| `dynamo_frontend_inflight_requests` | Gauge | Concurrent requests to engine | ✅ All services |
-| `dynamo_frontend_time_to_first_token_seconds` | Histogram | TTFT latency | ✅ Workers |
-| `dynamo_frontend_inter_token_latency_seconds` | Histogram | ITL latency | ✅ Decode |
-| `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | ⚠️ General |
-
-#### Metric Labels
-
-Dynamo metrics include these labels for filtering:
-
-| Label | Description | Example |
-|-------|-------------|---------|
-| `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dynamoNamespace}`) | `default-sglang-agg` |
-| `model` | Model being served | `Qwen/Qwen3-0.6B` |
-
-> **Note**: When you have multiple DGDs in the same namespace, use `dynamo_namespace` to filter metrics for a specific DGD.
-
-#### Example: Scale Decode Service Based on TTFT
-
-Using HPA with Prometheus Adapter requires configuring external metrics.
-
-**Step 1: Configure Prometheus Adapter**
-
-Add this to your Helm values file (e.g., `prometheus-adapter-values.yaml`):
-
-```yaml
-# prometheus-adapter-values.yaml
-prometheus:
-  url: http://prometheus-kube-prometheus-prometheus.monitoring.svc
-  port: 9090
-
-rules:
-  external:
-  # TTFT p95 from frontend - used to scale decode
-  - seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}'
-    resources:
-      overrides:
-        namespace: {resource: "namespace"}
-    name:
-      as: "dynamo_ttft_p95_seconds"
-    metricsQuery: |
-      histogram_quantile(0.95,
-        sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m]))
-        by (le, namespace, dynamo_namespace)
-      )
-```
-
-**Step 2: Install Prometheus Adapter**
-
-```bash
-helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
-helm repo update
-
-helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapter \
-  -n monitoring --create-namespace \
-  -f prometheus-adapter-values.yaml
-```
-
-**Step 3: Verify the metric is available**
-
-```bash
-kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/namespaces/<your-namespace>/dynamo_ttft_p95_seconds" | jq
-```
-
-**Step 4: Create the HPA**
-
-```yaml
-apiVersion: autoscaling/v2
-kind: HorizontalPodAutoscaler
-metadata:
-  name: sglang-agg-decode-hpa
-spec:
-  scaleTargetRef:
-    apiVersion: nvidia.com/v1alpha1
-    kind: DynamoGraphDeploymentScalingAdapter
-    name: sglang-agg-decode              # ← DGD name + service name (lowercase)
-  minReplicas: 1
-  maxReplicas: 10
-  metrics:
-  - type: External
-    external:
-      metric:
-        name: dynamo_ttft_p95_seconds
-        selector:
-          matchLabels:
-            dynamo_namespace: "default-sglang-agg"  # ← {namespace}-{dynamoNamespace}
-      target:
-        type: Value
-        value: "500m"  # Scale up when TTFT p95 > 500ms
-  behavior:
-    scaleDown:
-      stabilizationWindowSeconds: 60    # Wait 1 min before scaling down
-      policies:
-      - type: Pods
-        value: 1
-        periodSeconds: 30
-    scaleUp:
-      stabilizationWindowSeconds: 0      # Scale up immediately
-      policies:
-      - type: Pods
-        value: 2
-        periodSeconds: 30
-```
-
-**How it works:**
-1. Frontend pods export `dynamo_frontend_time_to_first_token_seconds` histogram
-2. Prometheus Adapter calculates p95 TTFT per `dynamo_namespace`
-3. HPA monitors this metric filtered by `dynamo_namespace: "default-sglang-agg"`
-4. When TTFT p95 > 500ms, HPA scales up the `sglang-agg-decode` adapter
-5. Adapter controller syncs the replica count to the DGD's `decode` service
-6. More decode workers are created, reducing TTFT
-
-#### Example: Scale Based on Queue Depth
-
-Add this rule to your `prometheus-adapter-values.yaml` (alongside the TTFT rule):
-
-```yaml
-# Add to rules.external in prometheus-adapter-values.yaml
-- seriesQuery: 'dynamo_frontend_queued_requests{namespace!=""}'
-  resources:
-    overrides:
-      namespace: {resource: "namespace"}
-  name:
-    as: "dynamo_queued_requests"
-  metricsQuery: |
-    sum(<<.Series>>{<<.LabelMatchers>>}) by (namespace, dynamo_namespace)
-```
-
-Then create the HPA:
-
-```yaml
-apiVersion: autoscaling/v2
-kind: HorizontalPodAutoscaler
-metadata:
-  name: sglang-agg-decode-queue-hpa
-  namespace: default
-spec:
-  scaleTargetRef:
-    apiVersion: nvidia.com/v1alpha1
-    kind: DynamoGraphDeploymentScalingAdapter
-    name: sglang-agg-decode
-  minReplicas: 1
-  maxReplicas: 10
-  metrics:
-  - type: External
-    external:
-      metric:
-        name: dynamo_queued_requests
-        selector:
-          matchLabels:
-            dynamo_namespace: "default-sglang-agg"
-      target:
-        type: Value
-        value: "10"  # Scale up when queue > 10 requests
-```
-
-## Autoscaling with KEDA (Recommended)
-
-KEDA (Kubernetes Event-driven Autoscaling) extends Kubernetes with event-driven autoscaling, supporting 50+ scalers including Prometheus.
-
-**Advantages over HPA + Prometheus Adapter:**
- No Prometheus Adapter configuration needed
- PromQL queries are defined in the ScaledObject itself (declarative, per-deployment)
- Easy to update - just `kubectl apply` the ScaledObject
- Can scale to zero when idle
- Supports multiple triggers per object
-
-**When to use KEDA:**
- You want simpler configuration (no Prometheus Adapter to manage)
- You need event-driven scaling (e.g., queue depth, Kafka, etc.)
- You want to scale to zero when idle
-
-### Installing KEDA
-
-```bash
-# Add KEDA Helm repo
-helm repo add kedacore https://kedacore.github.io/charts
-helm repo update
-
-# Install KEDA
-helm install keda kedacore/keda \
-  --namespace keda \
-  --create-namespace
-
-# Verify installation
-kubectl get pods -n keda
-```
-
-> **Note**: If you have Prometheus Adapter installed, either uninstall it first (`helm uninstall prometheus-adapter -n monitoring`) or install KEDA with `--set metricsServer.enabled=false` to avoid API conflicts.
-
-### Example: Scale Decode Based on TTFT
-
-Using the `sglang-agg` DGD from `examples/backends/sglang/deploy/agg.yaml`:
-
-```yaml
-apiVersion: keda.sh/v1alpha1
-kind: ScaledObject
-metadata:
-  name: sglang-agg-decode-scaler
-  namespace: default
-spec:
-  scaleTargetRef:
-    apiVersion: nvidia.com/v1alpha1
-    kind: DynamoGraphDeploymentScalingAdapter
-    name: sglang-agg-decode
-  minReplicaCount: 1
-  maxReplicaCount: 10
-  pollingInterval: 15      # Check metrics every 15 seconds
-  cooldownPeriod: 60       # Wait 60s before scaling down
-  triggers:
-  - type: prometheus
-    metadata:
-      # Update this URL to match your Prometheus service
-      serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
-      metricName: dynamo_ttft_p95
-      query: |
-        histogram_quantile(0.95,
-          sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m]))
-          by (le)
-        )
-      threshold: "0.5"              # Scale up when TTFT p95 > 500ms (0.5 seconds)
-      activationThreshold: "0.1"    # Start scaling when TTFT > 100ms
-```
-
-Apply it:
-
-```bash
-kubectl apply -f sglang-agg-decode-scaler.yaml
-```
-
-### Verify KEDA Scaling
-
-```bash
-# Check ScaledObject status
-kubectl get scaledobject -n default
-
-# KEDA creates an HPA under the hood - you can see it
-kubectl get hpa -n default
-
-# Example output:
-# NAME                                REFERENCE                                              TARGETS      MINPODS   MAXPODS   REPLICAS
-# keda-hpa-sglang-agg-decode-scaler   DynamoGraphDeploymentScalingAdapter/sglang-agg-decode  45m/500m     1         10        1
-
-# Get detailed status
-kubectl describe scaledobject sglang-agg-decode-scaler -n default
-```
-
-### Example: Scale Based on Queue Depth
-
-```yaml
-apiVersion: keda.sh/v1alpha1
-kind: ScaledObject
-metadata:
-  name: sglang-agg-decode-queue-scaler
-  namespace: default
-spec:
-  scaleTargetRef:
-    apiVersion: nvidia.com/v1alpha1
-    kind: DynamoGraphDeploymentScalingAdapter
-    name: sglang-agg-decode
-  minReplicaCount: 1
-  maxReplicaCount: 10
-  pollingInterval: 15
-  cooldownPeriod: 60
-  triggers:
-  - type: prometheus
-    metadata:
-      serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
-      metricName: dynamo_queued_requests
-      query: |
-        sum(dynamo_frontend_queued_requests{dynamo_namespace="default-sglang-agg"})
-      threshold: "10"    # Scale up when queue > 10 requests
-```
-
-### How KEDA Works
-
-KEDA creates and manages an HPA under the hood:
-
-```
-┌──────────────────────────────────────────────────────────────────────┐
-│  You create: ScaledObject                                            │
-│    - scaleTargetRef: sglang-agg-decode                               │
-│    - triggers: prometheus query                                      │
-└──────────────────────────────────────────────────────────────────────┘
-                                │
-                                ▼
-┌──────────────────────────────────────────────────────────────────────┐
-│  KEDA Operator automatically creates: HPA                            │
-│    - name: keda-hpa-sglang-agg-decode-scaler                         │
-│    - scaleTargetRef: sglang-agg-decode                               │
-│    - metrics: External (from KEDA metrics server)                    │
-└──────────────────────────────────────────────────────────────────────┘
-                                │
-                                ▼
-┌──────────────────────────────────────────────────────────────────────┐
-│  DynamoGraphDeploymentScalingAdapter: sglang-agg-decode              │
-│    - spec.replicas: updated by HPA                                   │
-└──────────────────────────────────────────────────────────────────────┘
-                                │
-                                ▼
-┌──────────────────────────────────────────────────────────────────────┐
-│  DynamoGraphDeployment: sglang-agg                                   │
-│    - spec.services.decode.replicas: synced from adapter              │
-└──────────────────────────────────────────────────────────────────────┘
-```
-
-## Mixed Autoscaling
-
-For disaggregated deployments (prefill + decode), you can use different autoscaling strategies for different services:
-
-```yaml
----
-# HPA for Frontend (CPU-based)
-apiVersion: autoscaling/v2
-kind: HorizontalPodAutoscaler
-metadata:
-  name: sglang-agg-frontend-hpa
-  namespace: default
-spec:
-  scaleTargetRef:
-    apiVersion: nvidia.com/v1alpha1
-    kind: DynamoGraphDeploymentScalingAdapter
-    name: sglang-agg-frontend
-  minReplicas: 1
-  maxReplicas: 5
-  metrics:
-  - type: Resource
-    resource:
-      name: cpu
-      target:
-        type: Utilization
-        averageUtilization: 70
-
----
-# KEDA for Decode (TTFT-based)
-apiVersion: keda.sh/v1alpha1
-kind: ScaledObject
-metadata:
-  name: sglang-agg-decode-scaler
-  namespace: default
-spec:
-  scaleTargetRef:
-    apiVersion: nvidia.com/v1alpha1
-    kind: DynamoGraphDeploymentScalingAdapter
-    name: sglang-agg-decode
-  minReplicaCount: 1
-  maxReplicaCount: 10
-  triggers:
-  - type: prometheus
-    metadata:
-      serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
-      query: |
-        histogram_quantile(0.95,
-          sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m]))
-          by (le)
-        )
-      threshold: "0.5"
-```
-
-## Manual Scaling
-
-### With DGDSA Enabled (Default)
-
-When DGDSA is enabled (the default), scale via the adapter:
-
-```bash
-kubectl scale dgdsa sglang-agg-decode -n default --replicas=3
-```
-
-Verify the scaling:
-
-```bash
-kubectl get dgdsa sglang-agg-decode -n default
-
-# Output:
-# NAME                DGD         SERVICE   REPLICAS   AGE
-# sglang-agg-decode   sglang-agg  decode    3          10m
-```
-
-> **Note**: If an autoscaler (KEDA, HPA, Planner) is managing the adapter, your change will be overwritten on the next evaluation cycle.
-
-### With DGDSA Disabled
-
-If you've disabled the scaling adapter for a service, edit the DGD directly:
-
-```bash
-kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}'
-```
-
-Or edit the YAML (no `scalingAdapter.enabled: true` means direct edits are allowed):
-
-```yaml
-spec:
-  services:
-    decode:
-      replicas: 3
-      # No scalingAdapter.enabled means replicas can be edited directly
-```
-
-## Best Practices
-
-### 1. Choose One Autoscaler Per Service
-
-Avoid configuring multiple autoscalers for the same service:
-
-| Configuration | Status |
-|---------------|--------|
-| HPA for frontend, Planner for prefill/decode | ✅ Good |
-| KEDA for all services | ✅ Good |
-| Planner only (default) | ✅ Good |
-| HPA + Planner both targeting decode | ❌ Bad - they will fight |
-
-### 2. Use Appropriate Metrics
-
-| Service Type | Recommended Metrics | Dynamo Metric |
-|--------------|---------------------|---------------|
-| Frontend | CPU utilization, request rate | `dynamo_frontend_requests_total` |
-| Prefill | Queue depth, TTFT | `dynamo_frontend_queued_requests`, `dynamo_frontend_time_to_first_token_seconds` |
-| Decode | ITL | `dynamo_frontend_inter_token_latency_seconds` |
-
-### 3. Configure Stabilization Windows
-
-Prevent thrashing with appropriate stabilization:
-
-```yaml
-# HPA
-behavior:
-  scaleDown:
-    stabilizationWindowSeconds: 300  # Wait 5 min before scaling down
-  scaleUp:
-    stabilizationWindowSeconds: 0    # Scale up immediately
-
-# KEDA
-spec:
-  cooldownPeriod: 300
-```
-
-### 4. Set Sensible Min/Max Replicas
-
-Always configure minimum and maximum replicas in your HPA/KEDA to prevent:
- Scaling to zero (unless intentional)
- Unbounded scaling that exhausts cluster resources
-
-## Troubleshooting
-
-### Adapters Not Created
-
-```bash
-# Check DGD status
-kubectl describe dgd sglang-agg -n default
-
-# Check operator logs
-kubectl logs -n dynamo-system deployment/dynamo-operator
-```
-
-### Scaling Not Working
-
-```bash
-# Check adapter status
-kubectl describe dgdsa sglang-agg-decode -n default
-
-# Check HPA/KEDA status
-kubectl describe hpa sglang-agg-decode-hpa -n default
-kubectl describe scaledobject sglang-agg-decode-scaler -n default
-
-# Verify metrics are available in Kubernetes metrics API
-kubectl get --raw /apis/external.metrics.k8s.io/v1beta1
-```
-
-### Metrics Not Available
-
-If HPA/KEDA shows `<unknown>` for metrics:
-
-```bash
-# Check if Dynamo metrics are being scraped
-kubectl port-forward -n default svc/sglang-agg-frontend 8000:8000
-curl http://localhost:8000/metrics | grep dynamo_frontend
-
-# Example output:
-# dynamo_frontend_queued_requests{model="Qwen/Qwen3-0.6B"} 2
-# dynamo_frontend_inflight_requests{model="Qwen/Qwen3-0.6B"} 5
-
-# Verify Prometheus is scraping the metrics
-kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
-# Then query: dynamo_frontend_time_to_first_token_seconds_bucket
-
-# Check KEDA operator logs
-kubectl logs -n keda deployment/keda-operator
-```
-
-### Rapid Scaling Up and Down
-
-If you see unstable scaling:
-
-1. Check if multiple autoscalers are targeting the same adapter
-2. Increase `cooldownPeriod` in KEDA ScaledObject
-3. Increase `stabilizationWindowSeconds` in HPA behavior
-
-## References
-
- [Kubernetes HPA Documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
- [KEDA Documentation](https://keda.sh/)
- [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter)
- [Planner Documentation](../components/planner/planner_guide.md)
- [Dynamo Metrics Reference](../observability/metrics.md)
- [Prometheus and Grafana Setup](../observability/prometheus-grafana.md)
-
--- a/docs/kubernetes/chrek/README.md
+++ b/docs/kubernetes/chrek/README.md
-# ChReK: Checkpoint/Restore in Kubernetes
-
-> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Limitations](#limitations) for details.
-
-**ChReK** (Checkpoint/Restore in Kubernetes) is an experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore in User-space). ChReK dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
-
-## What is ChReK?
-
-ChReK provides:
- **Fast cold starts**: Restore GPU-accelerated applications in seconds instead of minutes
- **CUDA state preservation**: Checkpoint and restore GPU memory and CUDA contexts
- **Kubernetes-native**: Integrates seamlessly with Kubernetes primitives
- **Storage flexibility**: PVC-based storage (S3/OCI planned for future releases)
- **Namespace isolation**: Each namespace gets its own checkpoint infrastructure
-
-## Use Cases
-
-### 1. With NVIDIA Dynamo Platform (Recommended)
-
-Use ChReK as part of the Dynamo platform for automatic checkpoint management:
- Automatic checkpoint creation and lifecycle management
- Seamless integration with DynamoGraphDeployment CRDs
- Built-in autoscaling with fast restore
-
-📖 **[Read the Dynamo Integration Guide →](dynamo.md)**
-
-### 2. Standalone (Without Dynamo)
-
-Use ChReK independently in your own Kubernetes applications:
- Manual checkpoint job creation
- Build your own restore-enabled container images
- Full control over checkpoint lifecycle
-
-📖 **[Read the Standalone Usage Guide →](standalone.md)**
-
-## Architecture
-
-ChReK consists of two main components:
-
-### 1. ChReK Helm Chart
-Deploys the checkpoint/restore infrastructure:
- **DaemonSet**: Runs on GPU nodes to perform CRIU checkpoint operations
- **PVC**: Stores checkpoint data (rootfs diffs, CUDA memory state)
- **RBAC**: Namespace-scoped or cluster-wide permissions
- **Seccomp Profile**: Security policies for CRIU syscalls
-
-### 2. Smart Entrypoint
-A wrapper script that intelligently decides between:
- **Cold start**: Normal application startup (when no checkpoint exists)
- **Restore**: CRIU restore from checkpoint (when checkpoint available)
-
-## Quick Start
-
-### Install ChReK Infrastructure
-
-```bash
-helm install chrek nvidia/chrek \
-  --namespace my-team \
-  --create-namespace \
-  --set storage.pvc.size=100Gi
-```
-
-### Choose Your Integration Path
-
- **Using Dynamo Platform?** → Follow the [Dynamo Integration Guide](dynamo.md)
- **Using standalone?** → Follow the [Standalone Usage Guide](standalone.md)
-
-## Key Features
-
-### ✅ Currently Supported
- ✅ **vLLM backend only** (SGLang and TensorRT-LLM planned)
- ✅ Single-node, single-GPU checkpoints
- ✅ PVC storage backend (RWX for multi-node)
- ✅ CUDA checkpoint/restore
- ✅ PyTorch distributed state (with `GLOO_SOCKET_IFNAME=lo`)
- ✅ Namespace-scoped and cluster-wide RBAC
- ✅ Idempotent checkpoint creation
- ✅ Automatic signal-based checkpoint coordination
-
-### 🚧 Planned Features
- 🚧 SGLang backend support
- 🚧 TensorRT-LLM backend support
- 🚧 S3/MinIO storage backend
- 🚧 OCI registry storage backend
- 🚧 Multi-GPU checkpoints
- 🚧 Multi-node distributed checkpoints
-
-## Limitations
-
-⚠️ **Important**: ChReK has significant limitations that may impact production readiness:
-
-### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function. This grants containers elevated host access and may violate security policies in many production environments.
- **Security Impact**: Privileged containers can:
-  - Access all host devices
-  - Bypass most security restrictions
-  - Potentially compromise node security if the container is exploited
-
-### Technical Limitations
- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
- **Single-node only**: Checkpoints must be created and restored on the same node
- **Single-GPU only**: Multi-GPU configurations not yet supported
- **Network state limitations**: Active TCP connections are closed during restore (use `tcp-close` CRIU option)
- **Storage**: Only PVC storage is currently implemented (S3/OCI planned)
-
-### Recommendation
-ChReK is best suited for:
- ✅ Development and testing environments
- ✅ Research and experimentation
- ✅ Controlled production environments with appropriate security controls
- ❌ Security-sensitive production workloads without proper risk assessment
-
-## Documentation
-
-### Getting Started
- [Dynamo Integration Guide](dynamo.md) - Using ChReK with Dynamo Platform
- [Standalone Usage Guide](standalone.md) - Using ChReK independently
- [ChReK Helm Chart README](../../../deploy/helm/charts/chrek/README.md) - Helm chart configuration
-
-### Related Documentation
- [CRIU Documentation](https://criu.org/Main_Page) - Upstream CRIU docs
-
-## Prerequisites
-
- Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- CRIU support in container runtime (containerd with CRIU plugin)
- RWX storage class (for multi-node deployments)
- **Security clearance for privileged pods** (required for restore operations)
-
-## Troubleshooting
-
-### Common Issues
-
-**DaemonSet not starting?**
- Check GPU node labels: `kubectl get nodes -l nvidia.com/gpu.present=true`
- Verify NVIDIA runtime is available
-
-**Checkpoint fails?**
- Check DaemonSet logs: `kubectl logs -l app.kubernetes.io/name=chrek -n <namespace>`
- Ensure application properly signals readiness
- Verify CRIU is installed in the runtime
-
-**Restore fails?**
- Ensure restore pod uses the same volumes as checkpoint job
- Verify `hostIPC: true` is set (required for CUDA)
- Check for `PSM3_DISABLED=1` and `GLOO_SOCKET_IFNAME=lo` environment variables
-
-For detailed troubleshooting, see:
- [Dynamo Integration Guide - Troubleshooting](dynamo.md#troubleshooting)
- [Standalone Guide - Troubleshooting](standalone.md#troubleshooting)
-
-## Contributing
-
-ChReK is part of the NVIDIA Dynamo project. Contributions are welcome!
-
-## License
-
-Apache License 2.0
--- a/docs/kubernetes/chrek/dynamo.md
+++ b/docs/kubernetes/chrek/dynamo.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# Checkpoint/Restore for Fast Pod Startup
-
-> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations. See [Limitations](#limitations) for details.
-
-Reduce cold start times for LLM inference workers from ~3 minutes to ~30 seconds using container checkpointing.
-
-## Overview
-
-Checkpointing captures the complete state of a running worker pod (including GPU memory) and saves it to storage. New pods can restore from this checkpoint instead of performing a full cold start.
-
-| Startup Type | Time | What Happens |
-|--------------|------|--------------|
-| **Cold Start** | ~3 min | Download model, load to GPU, initialize engine |
-| **Warm Start** (checkpoint) | ~30 sec | Restore from checkpoint tar |
-
-## Prerequisites
-
- Dynamo Platform installed (v0.4.0+)
- ChReK Helm chart installed (separate from platform)
- GPU nodes with CRIU support
- RWX PVC storage (PVC is currently the only supported backend)
-
-## Quick Start
-
-### 1. Install ChReK Infrastructure
-
-First, install the ChReK Helm chart in each namespace where you need checkpointing:
-
-```bash
-# Install ChReK infrastructure
-helm install chrek nvidia/chrek \
-  --namespace my-team \
-  --create-namespace \
-  --set storage.pvc.size=100Gi
-```
-
-This creates:
- A PVC for checkpoint storage (`chrek-pvc`)
- A DaemonSet for CRIU operations (`chrek-agent`)
-
-### 2. Configure Operator Values
-
-Update your Helm values to point to the ChReK infrastructure:
-
-```yaml
-# values.yaml
-dynamo-operator:
-  checkpoint:
-    enabled: true
-    storage:
-      type: pvc  # Only PVC is currently supported (S3/OCI planned)
-      pvc:
-        pvcName: "chrek-pvc"  # Must match ChReK chart
-        basePath: "/checkpoints"
-      signalHostPath: "/var/lib/chrek/signals"  # Must match ChReK chart
-```
-
-### 3. Configure Your DGD
-
-Add checkpoint configuration to your service:
-
-```yaml
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: my-llm
-spec:
-  services:
-    VllmWorker:
-      replicas: 1
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
-          args:
-            - python3 -m dynamo.vllm --model meta-llama/Llama-3-8B
-      resources:
-        limits:
-          nvidia.com/gpu: "1"
-
-      # Checkpoint configuration
-      checkpoint:
-        enabled: true
-        mode: auto  # Automatically create checkpoint if not found
-        identity:
-          model: "meta-llama/Llama-3-8B"
-          backendFramework: "vllm"
-          tensorParallelSize: 1
-          dtype: "bfloat16"
-```
-
-### 4. Deploy
-
-```bash
-kubectl apply -f my-llm.yaml -n dynamo-system
-```
-
-On first deployment:
-1. A checkpoint job runs to create the checkpoint
-2. Worker pods start with cold start (checkpoint not ready yet)
-3. Once checkpoint is ready, new pods (scale-up, restarts) restore from checkpoint
-
-## Storage Backends
-
-### PVC (Currently Supported)
-
-Use when you have RWX storage available (e.g., NFS, EFS, Filestore).
-
-```yaml
-checkpoint:
-  storage:
-    type: pvc
-    pvc:
-      pvcName: "chrek-pvc"
-      basePath: "/checkpoints"
-```
-
-**Requirements:**
- RWX (ReadWriteMany) PVC for multi-node access
- Sufficient storage (checkpoints are ~10-50GB per model)
-
-### S3 / MinIO (Planned - Not Yet Implemented)
-
-> ⚠️ **Note:** S3 storage backend is defined in the API but not yet fully implemented.
-
-Object storage support is planned for a future release. The configuration will look like:
-
-```yaml
-checkpoint:
-  storage:
-    type: s3  # Not yet supported
-    s3:
-      # AWS S3
-      uri: "s3://my-bucket/checkpoints"
-
-      # Or MinIO / custom S3 (use instead of the AWS URI above; `uri` may appear only once)
-      # uri: "s3://minio.example.com/my-bucket/checkpoints"
-
-      # Optional: credentials secret
-      credentialsSecretRef: "s3-creds"
-```
-
-### OCI Registry (Planned - Not Yet Implemented)
-
-> ⚠️ **Note:** OCI registry storage backend is defined in the API but not yet fully implemented.
-
-Container registry storage support is planned for a future release. The configuration will look like:
-
-```yaml
-checkpoint:
-  storage:
-    type: oci  # Not yet supported
-    oci:
-      uri: "oci://myregistry.io/checkpoints"
-      credentialsSecretRef: "registry-creds"  # Docker config secret
-```
-
-## Checkpoint Modes
-
-### Auto Mode (Recommended)
-
-The operator automatically creates a `DynamoCheckpoint` CR if one doesn't exist:
-
-```yaml
-checkpoint:
-  enabled: true
-  mode: auto
-  identity:
-    model: "meta-llama/Llama-3-8B"
-    backendFramework: "vllm"
-    tensorParallelSize: 1
-```
-
-### Reference Mode
-
-Reference an existing `DynamoCheckpoint` CR by its 16-character hash using `checkpointRef`:
-
-```yaml
-checkpoint:
-  enabled: true
-  checkpointRef: "e5962d34ba272638"  # 16-char hash of DynamoCheckpoint CR
-```
-
-This is useful when:
- You want to **pre-warm checkpoints** before creating DGDs
- You want **explicit control** over which checkpoint to use
-
-**Flow:**
-1. Create a `DynamoCheckpoint` CR (see [DynamoCheckpoint CRD](#dynamocheckpoint-crd) section)
-2. Wait for it to become `Ready`
-3. Reference it in your DGD using `checkpointRef` with the hash
-
-```bash
-# Check checkpoint status (using 16-char hash name)
-kubectl get dynamocheckpoint e5962d34ba272638 -n dynamo-system
-NAME                MODEL                   BACKEND  PHASE  HASH              AGE
-e5962d34ba272638    meta-llama/Llama-3-8B  vllm     Ready  e5962d34ba272638  5m
-
-# Now create DGD referencing it
-kubectl apply -f my-dgd.yaml
-```
-
-## Checkpoint Identity
-
-Checkpoints are uniquely identified by a **16-character identity hash** (a SHA256 digest truncated to 16 hex characters, i.e. 64 bits) computed from the configuration that affects runtime state:
-
-| Field | Required | Affects Hash | Example |
-|-------|----------|-------------|---------|
-| `model` | ✓ | ✓ | `meta-llama/Llama-3-8B` |
-| `backendFramework` | ✓ | ✓ | `vllm`, `sglang`, `trtllm` |
-| `dynamoVersion` | | ✓ | `0.9.0`, `1.0.0` |
-| `tensorParallelSize` | | ✓ | `1`, `2`, `4`, `8` (default: 1) |
-| `pipelineParallelSize` | | ✓ | `1`, `2` (default: 1) |
-| `dtype` | | ✓ | `float16`, `bfloat16`, `fp8` |
-| `maxModelLen` | | ✓ | `4096`, `8192` |
-| `extraParameters` | | ✓ | Custom key-value pairs |
-
-**Not included in hash** (don't invalidate checkpoint):
- `replicas`
- `nodeSelector`, `affinity`, `tolerations`
- `resources` (requests/limits)
- Logging/observability config
-
-**Example with all fields:**
-```yaml
-checkpoint:
-  enabled: true
-  mode: auto
-  identity:
-    model: "meta-llama/Llama-3-8B"
-    backendFramework: "vllm"
-    dynamoVersion: "0.9.0"
-    tensorParallelSize: 1
-    pipelineParallelSize: 1
-    dtype: "bfloat16"
-    maxModelLen: 8192
-    extraParameters:
-      enableChunkedPrefill: "true"
-      quantization: "awq"
-```
-
-**Checkpoint Naming:** The `DynamoCheckpoint` CR is automatically named using the 16-character identity hash (e.g., `e5962d34ba272638`).
-
-**Checkpoint Sharing:** Multiple DGDs with the same identity automatically share the same checkpoint.
-
-## DynamoCheckpoint CRD
-
-The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that manages checkpoint lifecycle.
-
-**When to create a DynamoCheckpoint directly:**
- **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
- **Explicit control:** Manage checkpoint lifecycle independently from DGDs
-
-**Note:** With the new hash-based naming, checkpoint names are automatically generated (16-character hash). The operator handles checkpoint discovery and reuse automatically in `auto` mode.
-
-**Create a checkpoint:**
-
-```yaml
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoCheckpoint
-metadata:
-  name: e5962d34ba272638  # Use the computed 16-char hash
-spec:
-  identity:
-    model: "meta-llama/Llama-3-8B"
-    backendFramework: "vllm"
-    tensorParallelSize: 1
-    dtype: "bfloat16"
-
-  job:
-    activeDeadlineSeconds: 3600
-    podTemplateSpec:
-      spec:
-        containers:
-          - name: main
-            image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
-            command: ["python3", "-m", "dynamo.vllm"]
-            args: ["--model", "meta-llama/Llama-3-8B"]
-            resources:
-              limits:
-                nvidia.com/gpu: "1"
-            env:
-              - name: HF_TOKEN
-                valueFrom:
-                  secretKeyRef:
-                    name: hf-token-secret
-                    key: HF_TOKEN
-```
-
-**Note:** You can compute the hash yourself, or use `auto` mode to let the operator create it.
-
-**Check status:**
-
-```bash
-# List all checkpoints
-kubectl get dynamocheckpoint -n dynamo-system
-# Or use shortname
-kubectl get dckpt -n dynamo-system
-
-NAME                MODEL                          BACKEND  PHASE    HASH              AGE
-e5962d34ba272638    meta-llama/Llama-3-8B         vllm     Ready    e5962d34ba272638  5m
-a7b4f89c12de3456    meta-llama/Llama-3-70B        vllm     Creating a7b4f89c12de3456  2m
-```
-
-**Phases:**
-| Phase | Description |
-|-------|-------------|
-| `Pending` | CR created, waiting for job to start |
-| `Creating` | Checkpoint job is running |
-| `Ready` | Checkpoint available for use |
-| `Failed` | Checkpoint creation failed |
-
-**Detailed status:**
-
-```bash
-kubectl describe dckpt e5962d34ba272638 -n dynamo-system
-```
-
-```yaml
-Status:
-  Phase: Ready
-  IdentityHash: e5962d34ba272638
-  Location: /checkpoints/e5962d34ba272638
-  StorageType: pvc
-  CreatedAt: 2026-01-29T10:05:00Z
-```
-
-**Reference from DGD:**
-
-Once the checkpoint is `Ready`, you can reference it by hash:
-
-```yaml
-spec:
-  services:
-    VllmWorker:
-      checkpoint:
-        enabled: true
-        checkpointRef: "e5962d34ba272638"  # 16-char hash
-```
-
-Or use `auto` mode and the operator will find/create it automatically.
-
-## Limitations
-
-⚠️ **Important**: ChReK has significant limitations that impact production readiness:
-
-### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function
- Privileged containers have elevated host access, which may violate security policies in many production environments
- This requirement applies to all worker pods that restore from checkpoints
-
-### Technical Limitations
- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
- **Single-node only**: Checkpoints must be created and restored on the same node
- **Single-GPU only**: Multi-GPU configurations are not yet supported
- **Network state**: Active TCP connections are closed during restore (handled with `tcp-close` CRIU option)
- **Storage**: Only PVC backend currently implemented (S3/OCI planned)
-
-### Recommendation
-ChReK is **experimental/beta** and best suited for:
- ✅ Development and testing environments
- ✅ Research and experimentation
- ✅ Controlled production environments with appropriate security controls
- ❌ Security-sensitive production workloads without proper risk assessment
-
-## Troubleshooting
-
-### Checkpoint Not Creating
-
-1. Check the checkpoint job:
-   ```bash
-   kubectl get jobs -l nvidia.com/checkpoint-source=true -n dynamo-system
-   kubectl logs job/checkpoint-<name> -n dynamo-system
-   ```
-
-2. Check the DaemonSet:
-   ```bash
-   kubectl logs daemonset/chrek-agent -n dynamo-system
-   ```
-
-3. Verify storage access:
-   ```bash
-   kubectl exec -it <checkpoint-agent-pod> -- ls -la /checkpoints
-   ```
-
-### Restore Failing
-
-1. Check pod logs:
-   ```bash
-   kubectl logs <worker-pod> -n dynamo-system
-   ```
-
-2. Verify checkpoint file exists:
-   ```bash
-   # For PVC
-   kubectl exec -it <any-pod-with-pvc> -- ls -la /checkpoints/
-
-   # For S3 (planned; the S3 backend is not yet implemented)
-   aws s3 ls s3://my-bucket/checkpoints/
-   ```
-
-3. Check environment variables:
-   ```bash
-   kubectl exec <worker-pod> -- env | grep DYN_CHECKPOINT
-   ```
-
-### Cold Start Despite Checkpoint
-
-Pods fall back to cold start if:
- Checkpoint file doesn't exist yet (still being created)
- Checkpoint file is corrupted
- CRIU restore fails
-
-Check the logs for the "Falling back to cold start" message.
-
-## Best Practices
-
-1. **Use RWX PVCs** for multi-node deployments (PVC is currently the only supported storage backend)
-2. **Pre-warm checkpoints** before scaling up
-3. **Monitor checkpoint size** - large models create large checkpoints
-4. **Clean up old checkpoints** to save storage
-
-## Environment Variables
-
-| Variable | Description |
-|----------|-------------|
-| `DYN_CHECKPOINT_STORAGE_TYPE` | Backend: `pvc`, `s3`, `oci` (only `pvc` is currently implemented) |
-| `DYN_CHECKPOINT_LOCATION` | Source location (URI) |
-| `DYN_CHECKPOINT_PATH` | Local path to tar file |
-| `DYN_CHECKPOINT_HASH` | Identity hash (debugging) |
-| `DYN_CHECKPOINT_SIGNAL_FILE` | Signal file (creation mode only) |
-
-## Complete Example
-
-Create a checkpoint and use it in a DynamoGraphDeployment (DGD):
-
-```yaml
-# 1. Create the DynamoCheckpoint CR
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoCheckpoint
-metadata:
-  name: e5962d34ba272638  # 16-char hash (computed from identity)
-  namespace: dynamo-system
-spec:
-  identity:
-    model: "meta-llama/Meta-Llama-3-8B-Instruct"
-    backendFramework: "vllm"
-    tensorParallelSize: 1
-    dtype: "bfloat16"
-  job:
-    activeDeadlineSeconds: 3600
-    backoffLimit: 3
-    podTemplateSpec:
-      spec:
-        containers:
-          - name: main
-            image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
-            command: ["python3", "-m", "dynamo.vllm"]
-            args:
-              - "--model"
-              - "meta-llama/Meta-Llama-3-8B-Instruct"
-              - "--tensor-parallel-size"
-              - "1"
-              - "--dtype"
-              - "bfloat16"
-            env:
-              - name: HF_TOKEN
-                valueFrom:
-                  secretKeyRef:
-                    name: hf-token-secret
-                    key: HF_TOKEN
-            resources:
-              limits:
-                nvidia.com/gpu: "1"
-        restartPolicy: Never
---
-# 2. Wait for Ready: kubectl get dckpt e5962d34ba272638 -n dynamo-system -w
---
-# 3. Reference the checkpoint in your DGD
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: my-llm
-  namespace: dynamo-system
-spec:
-  services:
-    VllmWorker:
-      replicas: 2
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
-      resources:
-        limits:
-          nvidia.com/gpu: "1"
-      checkpoint:
-        enabled: true
-        checkpointRef: "e5962d34ba272638"  # Reference by hash
-```
-
-## Related Documentation
-
- [ChReK Overview](README.md) - ChReK architecture and use cases
- [ChReK Standalone Usage Guide](standalone.md) - Use ChReK without Dynamo Platform
- [ChReK Helm Chart README](../../../deploy/helm/charts/chrek/README.md) - Chart configuration
- [Installation Guide](../installation_guide.md) - Platform installation
- [API Reference](../api_reference.md) - Complete CRD specifications
-